diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..a81c9aec0e462f295110eb8f76889760bf5a4047 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,30 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text +context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/graph.neff filter=lfs diff=lfs merge=lfs -text +layout_opt/model/graph.hlo filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk0/wrapped_neff.hlo filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk4/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk5/graph.neff filter=lfs diff=lfs merge=lfs -text +token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff filter=lfs diff=lfs merge=lfs -text diff --git a/context_encoding_model/_tp0_bk0/command.txt b/context_encoding_model/_tp0_bk0/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcc9bafeac6f2a725ed0fa2c4d20a8bfc3d55cea --- /dev/null +++ b/context_encoding_model/_tp0_bk0/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --output model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json b/context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json new file mode 100644 index 0000000000000000000000000000000000000000..1a6628305a5e9608fb3e3fbb8c75d9ca13fde233 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/compile_flags.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/global_metric_store.json b/context_encoding_model/_tp0_bk0/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..b74a2d400d99c84094b3b25177ca12c502758d4c --- /dev/null +++ b/context_encoding_model/_tp0_bk0/global_metric_store.json @@ -0,0 +1,1147 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.70232391357422, + "StaticProfiler::AveragePartitionUtilization": 94.02606201171875, + "StaticProfiler::AveragePeUtilization": 96.57791900634766, + "StaticProfiler::LocalizationEfficiency": 96.75444030761719, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.037471771240234375, + "AffinePredicateResolution": 0.0048100948333740234, + "AliasDependencyElimination": 0.0002529621124267578, + "AliasDependencyInduction": 0.005568504333496094, + "AliasDependencyReset": 0.11161017417907715, + "BFComputeCutting": 0.0024290084838867188, + "BirCodeGenLoop": 0.32352304458618164, + "CCOpFusion": 0.033486366271972656, + "CanonicalizeConv": 2.7000001864507794e-05, + "CanonicalizeDAGForPGTiling": 0.004197120666503906, + "CanonicalizeForTensorizer": 3.899999865097925e-05, + "CanonicalizeIR": 0.0025298595428466797, + "Canonicalizer": 0.00088900001719594, + "CoalesceCCOp": 0.014135599136352539, + "CommuteConcat": 0.0018744468688964844, + "DMALocalityOpt": 0.01189279556274414, + "DMAProfiler": 0.025990962982177734, + "DMATilingProfiler": 0.015254497528076172, + "DataLocalityOpt": 0.1120154857635498, + "DataStreaming": 0.03728485107421875, + "DeConcat": 0.0022406578063964844, + "DeadCodeElimination": 0.0021486282348632813, + "DeadStoreElimination": 0.0063364505767822266, + "DelinearIndices": 0.0064697265625, + "Delinearization": 0.004486560821533203, + "DelinearizeSPMD": 0.01732611656188965, + "DoNothing": 0.0007770061492919922, + "DramToDramTranspose": 0.02082037925720215, + "DumpGraphAndMetadata": 0.036411285400390625, + "EliminateDivs": 0.01006174087524414, + "ExpandBatchNorm": 0.0024886131286621094, + "ExpandISAMacro": 0.01822209358215332, + "FactorizeBlkDims": 0.07448649406433105, + "FactorizeThreadAxesInFreeDims": 0.0071103572845458984, + "FlattenMacroLoop": 0.009794235229492188, + "GenericAccessSimplifier": 0.0009224414825439453, + "HoistCompute": 7.000000096013537e-06, + "IdentifyCrossPassTensors": 3.600000127335079e-05, + "InferInitValue": 0.12128233909606934, + "InferIntrinsicOnCC": 0.01005697250366211, + "InferNeuronTensor": 0.029047489166259766, + "InferNonlocalTensors": 0.017493009567260742, + "InferPSumTensor": 0.09335684776306152, + "InferShardAxis": 0.26027798652648926, + "InferSharedMemLoc": 0.016659259796142578, + "InlineNativeKernels": 0.002816915512084961, + "InsertCoreBarrier": 0.0162966251373291, + "InsertIOTransposes": 0.019797325134277344, + "InsertImplicitShardAxisBeforeISel": 0.05061173439025879, + "InsertLocalTransposes": 0.004299163818359375, + "InsertOffloadedTransposes": 0.008011579513549805, + "LICM": 0.009003639221191406, + "LateLegalizeInst": 0.035849571228027344, + "LateLegalizePostSplit": 0.013758182525634766, + "LateLowerReshapeOp": 0.0012693405151367188, + "LateLowerTensorOp": 0.002027750015258789, + "LateNeuronInstComb": 0.14670348167419434, + "LayoutPreprocessing": 0.025156497955322266, + "LayoutPreprocessingAndAnalysis": 0.06950831413269043, + "LayoutRequirementAnalysis": 0.0069408416748046875, + "LegalizeCCOpLayout": 0.003494739532470703, + "LegalizeOpLevelAlias": 0.0016810894012451172, + "LegalizePartitionReduce": 0.0026693344116210938, + "LegalizeSundaAccess": 0.08684325218200684, + "LegalizeSundaMacro": 0.10486245155334473, + "LegalizeType": 0.06927132606506348, + "LocalLayoutOpt": 0.012215137481689453, + "LoopFusion": 0.0049479007720947266, + "LoopSplitting": 0.0008144378662109375, + "LowerBroadcast": 0.019241809844970703, + "LowerCCOpBlockAxis": 0.0037145614624023438, + "LowerComplexBroadcast": 0.0070230960845947266, + "LowerIntrinsics": 0.0899801254272461, + "LowerShardAxis": 0.020240068435668945, + "LowerTensorOp": 0.028459787368774414, + "LowerToSendRecv": 0.02129983901977539, + "LowerTranspose": 0.06694269180297852, + "MacroGeneration": 0.03631877899169922, + "MaskPropagation": 0.004620075225830078, + "MemcastMotion": 2.89999989036005e-05, + "MemcpyElimination": 0.04741477966308594, + "MutateDataType": 0.002264261245727539, + "NeuronAliasDependencyInduction": 0.002180337905883789, + "NeuronAliasDependencyReset": 0.08514618873596191, + "NeuronInstComb": 0.05580711364746094, + "NeuronLICM": 0.047100067138671875, + "NeuronLoopFusion": 0.05364656448364258, + "NeuronLoopInterchange": 0.002526521682739258, + "NeuronSimplifier": 0.06896662712097168, + "NeuronSimplifyPredicates": 0.042169809341430664, + "NeuronValueNumbering": 0.025714874267578125, + "OptimizeAliasedCopyChain": 0.0007548332214355469, + "OptimizeNKIKernels": 4.075549602508545, + "PAGLayoutOpt": 0.1111152172088623, + "PComputeCutting": 0.005707263946533203, + "PGLayoutTilingPipeline": 1.204958438873291, + "PGTiling": 0.4116194248199463, + "PadElimination": 0.0003600120544433594, + "ParAxesAnnotation": 0.050878286361694336, + "PartialLoopFusion": 0.0372469425201416, + "PartialSimdFusion": 0.021113157272338867, + "PenguinizeFunctions": 3.199999991920777e-05, + "PerfectLoopNest": 0.007718086242675781, + "PruneFunctions": 3.400000059627928e-05, + "RecognizeOpIdiom": 0.0058002471923828125, + "Recompute": 0.0017511844635009766, + "RelaxPredicates": 0.00795745849609375, + "Rematerialization": 0.0019276142120361328, + "RemoveOptimizationBarriers": 8.50000069476664e-05, + "RemoveShardedPartitionAxes": 0.008410930633544922, + "ReshapeWeights": 0.0063934326171875, + "ResolveAccessConflict": 0.01411294937133789, + "ResolveComplicatePredicates": 0.004876375198364258, + "RewriteReplicationMatmul": 0.0017600059509277344, + "RewriteWeights": 0.004542827606201172, + "SFKVectorizer": 0.3233633041381836, + "ScatterMotion": 5.7999997807201e-05, + "ShardingPropagationAnalysis": 0.06259655952453613, + "SimpleAllReduceTiling": 0.010744571685791016, + "Simplifier": 0.0033507347106933594, + "SimplifyMacroPredicates": 0.056143999099731445, + "SimplifyNeuronTensor": 0.1345655918121338, + "SimplifySlice": 0.001861572265625, + "SimplifyTensor": 0.02954578399658203, + "SpillPSum": 0.11643767356872559, + "SplitAPUnionSets": 0.07312703132629395, + "SplitAccGrp": 0.002663135528564453, + "StaticProfiler": 0.02257680892944336, + "StaticTransposeLocalTensor": 0.003572225570678711, + "SundaISel": 0.10315561294555664, + "TCTransform": 0.0025663375854492188, + "TensorInitialization": 0.00860285758972168, + "TensorOpSimplifier": 0.008630037307739258, + "TensorOpTransform": 0.028581619262695313, + "TensorizerLegalizationPass": 4.600000102072954e-05, + "TileCCOps": 0.00518488883972168, + "TilingProfiler": 0.023342609405517578, + "TransformConvOp": 0.008756637573242188, + "TritiumFusion": 0.13446974754333496, + "ValueNumbering": 0.003237485885620117, + "VectorizeDMA": 0.028183698654174805, + "VectorizeMatMult": 0.015199661254882813, + "VerifySupportedOps": 3.400000059627928e-05, + "WeightCoalescing": 0.01640915870666504, + "ZeroSizeTensorElimination": 0.0001671314239501953, + "algsimp": 0.0017099999822676182, + "batchnorm_expander": 3.400000059627928e-05, + "boundary-marker-removal": 1.2000000424450263e-05, + "call-inliner": 0.0002339999919058755, + "canonicalize-boundary-marker": 1.4999999621068127e-05, + "collective-stream-id-checker": 6.299999949987978e-05, + "comparison-expander": 0.0005050000036135316, + "computation-deduplicator": 5.100000271340832e-05, + "config-lowering": 0.0002690000110305846, + "constant-statistics": 0.000455000001238659, + "constant_folding": 0.00023099999816622585, + "cse": 3.7000001611886546e-05, + "dce": 6.000000212225132e-05, + "dot_decomposer": 0.0009510000236332417, + "dynamic-slice-transpose": 1.2999998943996616e-05, + "eliminate-redundant-compare": 0.00020500000391621143, + "emit-offloaded-dropout": 8.399999933317304e-05, + "flatten-call-graph": 0.0006050000083632767, + "fuse-send-recv": 5.199999577598646e-05, + "hilo-conditional-to-select": 1.4000000192027073e-05, + "hilo::LegalizeAlias": 1.2000000424450263e-05, + "hilo::NeuronInstCombine": 0.0001320000010309741, + "hilo::NeuronOpFusion": 9.099999442696571e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 3.300000025774352e-05, + "hilo::ScheduleFusion": 5.999999757477781e-06, + "hilo::SixtyFourHack": 5.999999848427251e-05, + "hilo::VerifyAliasing": 3.999999989900971e-06, + "hlo-mac-count": 0.012813999317586422, + "instruction-histogram": 0.0005469999741762877, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0009079999872483313, + "io-statistics": 4.400000034365803e-05, + "legalize-ccops-for-tensorizer": 3.999999989900971e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.999999747378752e-06, + "map-inline": 0.0007319999858736992, + "metadata-naming": 4.3000000005122274e-05, + "mlir::detail::OpToOpPassAdaptor": 7.100000220816582e-05, + "mlir::hlo::MhloToPyPenguin": 0.006075000390410423, + "mlir::mhlo::LowerComplexExtraPass": 0.0002460000105202198, + "mlir::mhlo::LowerComplexPass": 0.00047699996503069997, + "native-to-custom-softmax": 0.0005559999844990671, + "native-to-custom-softmax-dx": 0.0005599999567493796, + "neuron-hlo-verifier": 0.010796000249683857, + "operand_upcaster": 4.199999966658652e-05, + "opt-barrier-removal": 0.00039500001003034413, + "post-par-pipe-begin": 4.70000013592653e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.001361000002361834, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.05799899995326996, + "replace-minimum-constant": 0.0003459999861661345, + "reshape-mover": 8.900000102585182e-05, + "simplify-concat": 0.00010900000052060932, + "simplify-while-loops": 5.900000178371556e-05, + "transform-variadic-reduce": 5.699999746866524e-05, + "tuple-simplifier": 0.00020900000527035445, + "unpack-nested-aws-ntwsr": 0.00026500000967644155, + "unroll-while-loop": 9.000000318337698e-06, + "zero_sized_hlo_elimination": 0.0007340000011026859 + }, + "hilo": { + "ConstantSize": 238229.0, + "HloInputCount": 371.0, + "HloMacCount": 6666190848.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910913024.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 864804480.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0, + "StaticProfiler::AifUb": 131.73849487304688, + "StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734, + "StaticProfiler::AverageDmaLength": 2400.2490234375, + "StaticProfiler::DDRTransferBytes": 361746464.0, + "StaticProfiler::InternalTransferBytes": 320526112.0, + "StaticProfiler::LoadExpanded": 84060.0, + "StaticProfiler::StoreExpanded": 1898.0, + "StaticProfiler::TotalDMAExpanded": 85958.0, + "StaticProfiler::TotalDynamicInstancesCount": 25131.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 10368.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10147.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 92.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.001560000004246831, + "call-inliner": 0.0002099999983329326, + "collective-stream-id-checker": 5.6000000768108293e-05, + "comparison-expander": 0.0004900000058114529, + "constant-statistics": 0.000455000001238659, + "constant_folding": 0.00020900000527035445, + "dce": 5.700000110664405e-05, + "dot_decomposer": 0.0009510000236332417, + "eliminate-redundant-compare": 0.00019500000053085387, + "flatten-call-graph": 0.0005799999926239252, + "hlo-mac-count": 0.00829899962991476, + "instruction-histogram": 0.0005469999741762877, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0009079999872483313, + "io-statistics": 4.400000034365803e-05, + "map-inline": 0.0007019999902695417, + "native-to-custom-softmax": 0.0005370000144466758, + "native-to-custom-softmax-dx": 0.00047599998652003706, + "neuron-hlo-verifier": 0.009705999866127968, + "opt-barrier-removal": 0.00039500001003034413, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.05799899995326996, + "replace-minimum-constant": 0.0003279999946244061, + "reshape-mover": 7.999999797903001e-05, + "simplify-while-loops": 5.2999999752501026e-05, + "tuple-simplifier": 0.00019700000120792538, + "unpack-nested-aws-ntwsr": 0.00025400001322850585, + "unroll-while-loop": 9.000000318337698e-06, + "zero_sized_hlo_elimination": 0.0007340000011026859 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00032806396484375, + "DMALocalityOpt": 0.00027751922607421875, + "DMAProfiler": 0.0011353492736816406, + "DataStreaming": 0.00044035911560058594, + "DoNothing": 0.0001888275146484375, + "ExpandISAMacro": 0.003916263580322266, + "FactorizeBlkDims": 0.001834869384765625, + "InferPSumTensor": 0.0010616779327392578, + "InferSharedMemLoc": 0.00044918060302734375, + "InsertCoreBarrier": 0.0004329681396484375, + "LateLegalizeInst": 0.002650022506713867, + "LateNeuronInstComb": 0.002856016159057617, + "LegalizeSundaAccess": 0.002493619918823242, + "LegalizeType": 0.0004024505615234375, + "LowerBroadcast": 0.00041794776916503906, + "LowerIntrinsics": 0.0003495216369628906, + "LowerTranspose": 0.00037598609924316406, + "NeuronInstComb": 0.0011763572692871094, + "NeuronLICM": 0.0014426708221435547, + "NeuronSimplifyPredicates": 0.012172937393188477, + "NeuronValueNumbering": 0.0006816387176513672, + "SFKVectorizer": 0.011650562286376953, + "SimpleAllReduceTiling": 0.00033855438232421875, + "SimplifyNeuronTensor": 0.0009646415710449219, + "SpillPSum": 0.0025339126586914063, + "WeightCoalescing": 0.0003387928009033203 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 7.000000096013537e-06, + "CanonicalizeForTensorizer": 1.5999999959603883e-05, + "Canonicalizer": 0.00033000000985339284, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.4000000192027073e-05, + "MemcastMotion": 9.999999747378752e-06, + "PenguinizeFunctions": 1.4999999621068127e-05, + "PruneFunctions": 1.4999999621068127e-05, + "RemoveOptimizationBarriers": 3.300000025774352e-05, + "ScatterMotion": 2.2000000171829015e-05, + "TensorizerLegalizationPass": 2.8000000384054147e-05, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 5.199999941396527e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.000000096013537e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 1.9999999949504854e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.5999999959603883e-05, + "config-lowering": 0.0001289999927394092, + "constant_folding": 7.000000096013537e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 4.999999873689376e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 4.5000000682193786e-05, + "flatten-call-graph": 7.999999979801942e-06, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 6.000000212225132e-06, + "hilo::NeuronInstCombine": 6.70000008540228e-05, + "hilo::NeuronOpFusion": 4.099999932805076e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.2000000424450263e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 1.8999999156221747e-05, + "legalize-ccops-for-tensorizer": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.000000318337698e-06, + "metadata-naming": 1.2999999853491317e-05, + "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009730000165291131, + "mlir::mhlo::LowerComplexExtraPass": 8.399999933317304e-05, + "mlir::mhlo::LowerComplexPass": 0.000195999993593432, + "native-to-custom-softmax": 9.000000318337698e-06, + "native-to-custom-softmax-dx": 5.500000042957254e-05, + "neuron-hlo-verifier": 0.0003929999948013574, + "operand_upcaster": 1.700000029813964e-05, + "post-par-pipe-begin": 4.400000034365803e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00047500000800937414, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.400000059627928e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.999999979801942e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 4.265669345855713, + "ConstantSize": 238229.0, + "HloInputCount": 371.0, + "HloMacCount": 838860800.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910913024.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 393307936.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.04803347587585449, + "AffinePredicateResolution": 0.0014185905456542969, + "AliasDependencyElimination": 0.0002288818359375, + "AliasDependencyInduction": 0.023572683334350586, + "AliasDependencyReset": 0.050307273864746094, + "BFComputeCutting": 0.0020284652709960938, + "BirCodeGenLoop": 0.06627583503723145, + "CCOpFusion": 0.030767440795898438, + "CanonicalizeDAGForPGTiling": 0.005156278610229492, + "CanonicalizeIR": 0.0024123191833496094, + "CoalesceCCOp": 0.017067909240722656, + "CommuteConcat": 0.0011420249938964844, + "DMALocalityOpt": 0.0021338462829589844, + "DMAProfiler": 0.015033483505249023, + "DMATilingProfiler": 0.006984710693359375, + "DataLocalityOpt": 0.3054013252258301, + "DataStreaming": 0.014647245407104492, + "DeConcat": 0.005982398986816406, + "DeadCodeElimination": 0.0018534660339355469, + "DeadStoreElimination": 0.04532670974731445, + "DelinearIndices": 0.028018474578857422, + "Delinearization": 0.0051403045654296875, + "DelinearizeSPMD": 0.03557705879211426, + "DoNothing": 0.00012373924255371094, + "DramToDramTranspose": 0.030788660049438477, + "DumpGraphAndMetadata": 0.008297920227050781, + "EliminateDivs": 0.003348112106323242, + "ExpandBatchNorm": 0.002971172332763672, + "ExpandISAMacro": 0.007505178451538086, + "FactorizeBlkDims": 0.052065372467041016, + "FactorizeThreadAxesInFreeDims": 0.006781101226806641, + "FlattenMacroLoop": 0.006749868392944336, + "GenericAccessSimplifier": 0.0015370845794677734, + "InferInitValue": 0.13031220436096191, + "InferIntrinsicOnCC": 0.01256871223449707, + "InferNeuronTensor": 0.07101988792419434, + "InferNonlocalTensors": 0.0933828353881836, + "InferPSumTensor": 0.09560966491699219, + "InferShardAxis": 0.312000036239624, + "InferSharedMemLoc": 0.006642341613769531, + "InlineNativeKernels": 0.0033979415893554688, + "InsertCoreBarrier": 0.008008241653442383, + "InsertIOTransposes": 0.018876314163208008, + "InsertImplicitShardAxisBeforeISel": 0.016681194305419922, + "InsertLocalTransposes": 0.009229898452758789, + "InsertOffloadedTransposes": 0.05370330810546875, + "LICM": 0.007573604583740234, + "LateLegalizeInst": 0.01623988151550293, + "LateLegalizePostSplit": 0.007147073745727539, + "LateLowerReshapeOp": 0.0011415481567382813, + "LateLowerTensorOp": 0.0066013336181640625, + "LateNeuronInstComb": 0.12343692779541016, + "LayoutPreprocessing": 0.02958393096923828, + "LayoutPreprocessingAndAnalysis": 0.14548635482788086, + "LayoutRequirementAnalysis": 0.007357358932495117, + "LegalizeCCOpLayout": 0.0018928050994873047, + "LegalizeOpLevelAlias": 0.001081228256225586, + "LegalizePartitionReduce": 0.003218412399291992, + "LegalizeSundaAccess": 0.08743572235107422, + "LegalizeSundaMacro": 0.04705023765563965, + "LegalizeType": 0.009063720703125, + "LocalLayoutOpt": 0.017424583435058594, + "LoopFusion": 0.006888866424560547, + "LoopSplitting": 0.0018482208251953125, + "LowerBroadcast": 0.00490117073059082, + "LowerCCOpBlockAxis": 0.004808902740478516, + "LowerComplexBroadcast": 0.007742166519165039, + "LowerIntrinsics": 0.04466986656188965, + "LowerShardAxis": 0.008558988571166992, + "LowerTensorOp": 0.011698722839355469, + "LowerToSendRecv": 0.01171255111694336, + "LowerTranspose": 0.012961864471435547, + "MacroGeneration": 0.07335543632507324, + "MaskPropagation": 0.004875659942626953, + "MemcpyElimination": 0.19086575508117676, + "MutateDataType": 0.002115011215209961, + "NeuronAliasDependencyInduction": 0.0007119178771972656, + "NeuronAliasDependencyReset": 0.0555264949798584, + "NeuronInstComb": 0.03685903549194336, + "NeuronLICM": 0.02129840850830078, + "NeuronLoopFusion": 0.04936552047729492, + "NeuronLoopInterchange": 0.008442163467407227, + "NeuronSimplifier": 0.020423412322998047, + "NeuronSimplifyPredicates": 0.013469934463500977, + "NeuronValueNumbering": 0.011552095413208008, + "OptimizeAliasedCopyChain": 0.0006189346313476563, + "OptimizeNKIKernels": 0.0030050277709960938, + "PAGLayoutOpt": 0.4311056137084961, + "PComputeCutting": 0.008741617202758789, + "PGLayoutTilingPipeline": 1.7890496253967285, + "PGTiling": 0.33126235008239746, + "PadElimination": 0.0006849765777587891, + "ParAxesAnnotation": 0.3421931266784668, + "PartialLoopFusion": 0.05652737617492676, + "PartialSimdFusion": 0.04400372505187988, + "PerfectLoopNest": 0.007196664810180664, + "RecognizeOpIdiom": 0.003924369812011719, + "Recompute": 0.0004436969757080078, + "RelaxPredicates": 0.006342649459838867, + "Rematerialization": 0.006484508514404297, + "RemoveShardedPartitionAxes": 0.03604388236999512, + "ReshapeWeights": 0.002611398696899414, + "ResolveAccessConflict": 0.01564621925354004, + "ResolveComplicatePredicates": 0.0013320446014404297, + "RewriteReplicationMatmul": 0.008888483047485352, + "RewriteWeights": 0.005518674850463867, + "SFKVectorizer": 0.23942208290100098, + "ShardingPropagationAnalysis": 0.06231117248535156, + "SimpleAllReduceTiling": 0.008965253829956055, + "Simplifier": 0.009177446365356445, + "SimplifyMacroPredicates": 0.03521132469177246, + "SimplifyNeuronTensor": 0.022907257080078125, + "SimplifySlice": 0.001043081283569336, + "SimplifyTensor": 0.028610706329345703, + "SpillPSum": 0.041993141174316406, + "SplitAPUnionSets": 0.06584334373474121, + "SplitAccGrp": 0.005825042724609375, + "StaticProfiler": 0.013434648513793945, + "StaticTransposeLocalTensor": 0.008102178573608398, + "SundaISel": 0.12313151359558105, + "TCTransform": 0.0010597705841064453, + "TensorInitialization": 0.024387359619140625, + "TensorOpSimplifier": 0.006582498550415039, + "TensorOpTransform": 0.06252408027648926, + "TileCCOps": 0.016498565673828125, + "TilingProfiler": 0.06818985939025879, + "TransformConvOp": 0.0028336048126220703, + "TritiumFusion": 0.01378488540649414, + "ValueNumbering": 0.0024378299713134766, + "VectorizeDMA": 0.042115211486816406, + "VectorizeMatMult": 0.008977413177490234, + "WeightCoalescing": 0.005861759185791016, + "ZeroSizeTensorElimination": 0.00017881393432617188 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 416.0, + "StaticProfiler::AifUb": 5.140732288360596, + "StaticProfiler::ArithmeticIntensityTensorizer": 143.96510314941406, + "StaticProfiler::AverageDmaLength": 2013.53125, + "StaticProfiler::AverageFractalPeUtilization": 99.74824523925781, + "StaticProfiler::AveragePartitionUtilization": 99.1868667602539, + "StaticProfiler::AveragePeUtilization": 99.49378204345703, + "StaticProfiler::DDRTransferBytes": 16395014.0, + "StaticProfiler::InternalTransferBytes": 10682368.0, + "StaticProfiler::LoadExpanded": 3459.0, + "StaticProfiler::LocalizationEfficiency": 2800.478271484375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 3271.21728515625, + "StaticProfiler::StoreExpanded": 1537.0, + "StaticProfiler::TotalDMAExpanded": 4996.0, + "StaticProfiler::TotalDynamicInstancesCount": 801.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 800.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 10.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 253.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 5.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 56.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 32.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 24.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 47.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.19573044776916504, + "AffinePredicateResolution": 0.0011768341064453125, + "AliasDependencyElimination": 0.00014972686767578125, + "AliasDependencyInduction": 0.02374124526977539, + "AliasDependencyReset": 0.05898928642272949, + "BFComputeCutting": 0.0019648075103759766, + "BirCodeGenLoop": 0.04745078086853027, + "CCOpFusion": 0.034403324127197266, + "CanonicalizeDAGForPGTiling": 0.013227224349975586, + "CanonicalizeIR": 0.0016665458679199219, + "CoalesceCCOp": 0.008426904678344727, + "CommuteConcat": 0.0011937618255615234, + "DMALocalityOpt": 0.0020418167114257813, + "DMAProfiler": 0.0212709903717041, + "DMATilingProfiler": 0.007970333099365234, + "DataLocalityOpt": 0.31763386726379395, + "DataStreaming": 0.013140678405761719, + "DeConcat": 0.006093025207519531, + "DeadCodeElimination": 0.0022492408752441406, + "DeadStoreElimination": 0.03447914123535156, + "DelinearIndices": 0.017621278762817383, + "Delinearization": 0.006613731384277344, + "DelinearizeSPMD": 0.036255598068237305, + "DoNothing": 9.298324584960938e-05, + "DramToDramTranspose": 0.011357545852661133, + "DumpGraphAndMetadata": 0.0038836002349853516, + "EliminateDivs": 0.007913589477539063, + "ExpandBatchNorm": 0.0027163028717041016, + "ExpandISAMacro": 0.006444692611694336, + "FactorizeBlkDims": 0.023404359817504883, + "FactorizeThreadAxesInFreeDims": 0.011568069458007813, + "FlattenMacroLoop": 0.012357473373413086, + "GenericAccessSimplifier": 0.0020608901977539063, + "InferInitValue": 0.10583114624023438, + "InferIntrinsicOnCC": 0.00994729995727539, + "InferNeuronTensor": 0.04976606369018555, + "InferNonlocalTensors": 0.04819130897521973, + "InferPSumTensor": 0.0679934024810791, + "InferShardAxis": 0.6268763542175293, + "InferSharedMemLoc": 0.005129814147949219, + "InlineNativeKernels": 0.009308338165283203, + "InsertCoreBarrier": 0.00969243049621582, + "InsertIOTransposes": 0.03561210632324219, + "InsertImplicitShardAxisBeforeISel": 0.017783164978027344, + "InsertLocalTransposes": 0.012435436248779297, + "InsertOffloadedTransposes": 0.008218526840209961, + "LICM": 0.011756420135498047, + "LateLegalizeInst": 0.012684106826782227, + "LateLegalizePostSplit": 0.0054225921630859375, + "LateLowerReshapeOp": 0.002172231674194336, + "LateLowerTensorOp": 0.003939151763916016, + "LateNeuronInstComb": 0.07796549797058105, + "LayoutPreprocessing": 0.09417939186096191, + "LayoutPreprocessingAndAnalysis": 0.15397191047668457, + "LayoutRequirementAnalysis": 0.03167152404785156, + "LegalizeCCOpLayout": 0.001916646957397461, + "LegalizeOpLevelAlias": 0.00103759765625, + "LegalizePartitionReduce": 0.002568960189819336, + "LegalizeSundaAccess": 0.03490424156188965, + "LegalizeSundaMacro": 0.04486250877380371, + "LegalizeType": 0.010438203811645508, + "LocalLayoutOpt": 0.037950992584228516, + "LoopFusion": 0.00687098503112793, + "LoopSplitting": 0.002494335174560547, + "LowerBroadcast": 0.0028448104858398438, + "LowerCCOpBlockAxis": 0.016790151596069336, + "LowerComplexBroadcast": 0.003789663314819336, + "LowerIntrinsics": 0.06158947944641113, + "LowerShardAxis": 0.009115934371948242, + "LowerTensorOp": 0.011396646499633789, + "LowerToSendRecv": 0.00603795051574707, + "LowerTranspose": 0.030293703079223633, + "MacroGeneration": 0.14122748374938965, + "MaskPropagation": 0.007950544357299805, + "MemcpyElimination": 0.18889641761779785, + "MutateDataType": 0.0014033317565917969, + "NeuronAliasDependencyInduction": 0.0007326602935791016, + "NeuronAliasDependencyReset": 0.025636672973632813, + "NeuronInstComb": 0.0452880859375, + "NeuronLICM": 0.027920246124267578, + "NeuronLoopFusion": 0.07481861114501953, + "NeuronLoopInterchange": 0.004810810089111328, + "NeuronSimplifier": 0.027257442474365234, + "NeuronSimplifyPredicates": 0.011795282363891602, + "NeuronValueNumbering": 0.013232946395874023, + "OptimizeAliasedCopyChain": 0.000640869140625, + "OptimizeNKIKernels": 0.007096529006958008, + "PAGLayoutOpt": 0.25133657455444336, + "PComputeCutting": 0.02008199691772461, + "PGLayoutTilingPipeline": 2.1073567867279053, + "PGTiling": 0.5283112525939941, + "PadElimination": 0.0005664825439453125, + "ParAxesAnnotation": 0.16274571418762207, + "PartialLoopFusion": 0.07154703140258789, + "PartialSimdFusion": 0.05425691604614258, + "PerfectLoopNest": 0.007505655288696289, + "RecognizeOpIdiom": 0.004193305969238281, + "Recompute": 0.0005002021789550781, + "RelaxPredicates": 0.0031478404998779297, + "Rematerialization": 0.002758502960205078, + "RemoveShardedPartitionAxes": 0.05587267875671387, + "ReshapeWeights": 0.0015969276428222656, + "ResolveAccessConflict": 0.021365642547607422, + "ResolveComplicatePredicates": 0.0011401176452636719, + "RewriteReplicationMatmul": 0.0025501251220703125, + "RewriteWeights": 0.014093399047851563, + "SFKVectorizer": 0.51774001121521, + "ShardingPropagationAnalysis": 0.030755996704101563, + "SimpleAllReduceTiling": 0.003780364990234375, + "Simplifier": 0.006270885467529297, + "SimplifyMacroPredicates": 0.01894402503967285, + "SimplifyNeuronTensor": 0.036655426025390625, + "SimplifySlice": 0.0019352436065673828, + "SimplifyTensor": 0.033560752868652344, + "SpillPSum": 0.03554582595825195, + "SplitAPUnionSets": 0.039057016372680664, + "SplitAccGrp": 0.002908468246459961, + "StaticProfiler": 0.009857654571533203, + "StaticTransposeLocalTensor": 0.014261007308959961, + "SundaISel": 0.07885026931762695, + "TCTransform": 0.0012857913970947266, + "TensorInitialization": 0.011929512023925781, + "TensorOpSimplifier": 0.007134199142456055, + "TensorOpTransform": 0.05220603942871094, + "TileCCOps": 0.006574392318725586, + "TilingProfiler": 0.03860926628112793, + "TransformConvOp": 0.002733469009399414, + "TritiumFusion": 0.08646178245544434, + "ValueNumbering": 0.003155946731567383, + "VectorizeDMA": 0.029859304428100586, + "VectorizeMatMult": 0.011672019958496094, + "WeightCoalescing": 0.004624366760253906, + "ZeroSizeTensorElimination": 0.0002124309539794922 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 1427.0, + "StaticProfiler::AifUb": 40.19935607910156, + "StaticProfiler::ArithmeticIntensityTensorizer": 134.3648223876953, + "StaticProfiler::AverageDmaLength": 4238.58251953125, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.61003112792969, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 55879176.0, + "StaticProfiler::InternalTransferBytes": 9895936.0, + "StaticProfiler::LoadExpanded": 9729.0, + "StaticProfiler::LocalizationEfficiency": 334.2462158203125, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 357.74188232421875, + "StaticProfiler::StoreExpanded": 769.0, + "StaticProfiler::TotalDMAExpanded": 10498.0, + "StaticProfiler::TotalDynamicInstancesCount": 1799.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1799.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 8.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 1116.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 66.0, + "TilingProfiler::PfTransposeInstructionsForIo": 18.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 16.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 32.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 87.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.037471771240234375, + "AffinePredicateResolution": 0.0048100948333740234, + "AliasDependencyElimination": 0.0002529621124267578, + "AliasDependencyInduction": 0.005568504333496094, + "AliasDependencyReset": 0.11161017417907715, + "BFComputeCutting": 0.0024290084838867188, + "BirCodeGenLoop": 0.32352304458618164, + "CCOpFusion": 0.033486366271972656, + "CanonicalizeDAGForPGTiling": 0.004197120666503906, + "CanonicalizeIR": 0.0025298595428466797, + "CoalesceCCOp": 0.007080078125, + "CommuteConcat": 0.0018744468688964844, + "DMALocalityOpt": 0.0021386146545410156, + "DMAProfiler": 0.01854729652404785, + "DMATilingProfiler": 0.015254497528076172, + "DataLocalityOpt": 0.1120154857635498, + "DataStreaming": 0.007681369781494141, + "DeConcat": 0.0022406578063964844, + "DeadCodeElimination": 0.0021486282348632813, + "DeadStoreElimination": 0.0063364505767822266, + "DelinearIndices": 0.0064697265625, + "Delinearization": 0.004486560821533203, + "DelinearizeSPMD": 0.01732611656188965, + "DoNothing": 9.441375732421875e-05, + "DramToDramTranspose": 0.02082037925720215, + "DumpGraphAndMetadata": 0.036411285400390625, + "EliminateDivs": 0.01006174087524414, + "ExpandBatchNorm": 0.0024886131286621094, + "ExpandISAMacro": 0.007379293441772461, + "FactorizeBlkDims": 0.023633480072021484, + "FactorizeThreadAxesInFreeDims": 0.0071103572845458984, + "FlattenMacroLoop": 0.009794235229492188, + "GenericAccessSimplifier": 0.0009224414825439453, + "InferInitValue": 0.12128233909606934, + "InferIntrinsicOnCC": 0.01005697250366211, + "InferNeuronTensor": 0.029047489166259766, + "InferNonlocalTensors": 0.017493009567260742, + "InferPSumTensor": 0.04303455352783203, + "InferShardAxis": 0.26027798652648926, + "InferSharedMemLoc": 0.012881040573120117, + "InlineNativeKernels": 0.002816915512084961, + "InsertCoreBarrier": 0.009889602661132813, + "InsertIOTransposes": 0.019797325134277344, + "InsertImplicitShardAxisBeforeISel": 0.05061173439025879, + "InsertLocalTransposes": 0.004299163818359375, + "InsertOffloadedTransposes": 0.008011579513549805, + "LICM": 0.009003639221191406, + "LateLegalizeInst": 0.013794183731079102, + "LateLegalizePostSplit": 0.013758182525634766, + "LateLowerReshapeOp": 0.0012693405151367188, + "LateLowerTensorOp": 0.002027750015258789, + "LateNeuronInstComb": 0.09844541549682617, + "LayoutPreprocessing": 0.025156497955322266, + "LayoutPreprocessingAndAnalysis": 0.06950831413269043, + "LayoutRequirementAnalysis": 0.0069408416748046875, + "LegalizeCCOpLayout": 0.003494739532470703, + "LegalizeOpLevelAlias": 0.0016810894012451172, + "LegalizePartitionReduce": 0.0026693344116210938, + "LegalizeSundaAccess": 0.0380399227142334, + "LegalizeSundaMacro": 0.10486245155334473, + "LegalizeType": 0.015400409698486328, + "LocalLayoutOpt": 0.012215137481689453, + "LoopFusion": 0.0049479007720947266, + "LoopSplitting": 0.0008144378662109375, + "LowerBroadcast": 0.0033435821533203125, + "LowerCCOpBlockAxis": 0.0037145614624023438, + "LowerComplexBroadcast": 0.0070230960845947266, + "LowerIntrinsics": 0.08174729347229004, + "LowerShardAxis": 0.020240068435668945, + "LowerTensorOp": 0.028459787368774414, + "LowerToSendRecv": 0.02129983901977539, + "LowerTranspose": 0.05583548545837402, + "MacroGeneration": 0.03631877899169922, + "MaskPropagation": 0.004620075225830078, + "MemcpyElimination": 0.04741477966308594, + "MutateDataType": 0.002264261245727539, + "NeuronAliasDependencyInduction": 0.002180337905883789, + "NeuronAliasDependencyReset": 0.08514618873596191, + "NeuronInstComb": 0.017351865768432617, + "NeuronLICM": 0.015241861343383789, + "NeuronLoopFusion": 0.05364656448364258, + "NeuronLoopInterchange": 0.002526521682739258, + "NeuronSimplifier": 0.06896662712097168, + "NeuronSimplifyPredicates": 0.023428916931152344, + "NeuronValueNumbering": 0.009569168090820313, + "OptimizeAliasedCopyChain": 0.0007548332214355469, + "OptimizeNKIKernels": 4.075549602508545, + "PAGLayoutOpt": 0.1111152172088623, + "PComputeCutting": 0.005707263946533203, + "PGLayoutTilingPipeline": 1.204958438873291, + "PGTiling": 0.4116194248199463, + "PadElimination": 0.0003600120544433594, + "ParAxesAnnotation": 0.050878286361694336, + "PartialLoopFusion": 0.0372469425201416, + "PartialSimdFusion": 0.021113157272338867, + "PerfectLoopNest": 0.007718086242675781, + "RecognizeOpIdiom": 0.0058002471923828125, + "Recompute": 0.0017511844635009766, + "RelaxPredicates": 0.00795745849609375, + "Rematerialization": 0.0019276142120361328, + "RemoveShardedPartitionAxes": 0.008410930633544922, + "ReshapeWeights": 0.0063934326171875, + "ResolveAccessConflict": 0.01411294937133789, + "ResolveComplicatePredicates": 0.004876375198364258, + "RewriteReplicationMatmul": 0.0017600059509277344, + "RewriteWeights": 0.004542827606201172, + "SFKVectorizer": 0.23946118354797363, + "ShardingPropagationAnalysis": 0.06259655952453613, + "SimpleAllReduceTiling": 0.004370212554931641, + "Simplifier": 0.0033507347106933594, + "SimplifyMacroPredicates": 0.056143999099731445, + "SimplifyNeuronTensor": 0.020067691802978516, + "SimplifySlice": 0.001861572265625, + "SimplifyTensor": 0.02954578399658203, + "SpillPSum": 0.03782367706298828, + "SplitAPUnionSets": 0.07312703132629395, + "SplitAccGrp": 0.002663135528564453, + "StaticProfiler": 0.02257680892944336, + "StaticTransposeLocalTensor": 0.003572225570678711, + "SundaISel": 0.10315561294555664, + "TCTransform": 0.0025663375854492188, + "TensorInitialization": 0.00860285758972168, + "TensorOpSimplifier": 0.008630037307739258, + "TensorOpTransform": 0.028581619262695313, + "TileCCOps": 0.00518488883972168, + "TilingProfiler": 0.023342609405517578, + "TransformConvOp": 0.008756637573242188, + "TritiumFusion": 0.13446974754333496, + "ValueNumbering": 0.003237485885620117, + "VectorizeDMA": 0.028183698654174805, + "VectorizeMatMult": 0.015199661254882813, + "WeightCoalescing": 0.0020062923431396484, + "ZeroSizeTensorElimination": 0.0001671314239501953 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 20773.0, + "StaticProfiler::AifUb": 131.73849487304688, + "StaticProfiler::ArithmeticIntensityTensorizer": 127.46285247802734, + "StaticProfiler::AverageDmaLength": 2400.2490234375, + "StaticProfiler::AverageFractalPeUtilization": 98.70232391357422, + "StaticProfiler::AveragePartitionUtilization": 94.02606201171875, + "StaticProfiler::AveragePeUtilization": 96.57791900634766, + "StaticProfiler::DDRTransferBytes": 361746464.0, + "StaticProfiler::InternalTransferBytes": 320526112.0, + "StaticProfiler::LoadExpanded": 84060.0, + "StaticProfiler::LocalizationEfficiency": 96.75444030761719, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.23246002197266, + "StaticProfiler::StoreExpanded": 1898.0, + "StaticProfiler::TotalDMAExpanded": 85958.0, + "StaticProfiler::TotalDynamicInstancesCount": 25131.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24680.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 10368.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10147.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 642.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 92.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 1.8000000636675395e-05, + "CanonicalizeForTensorizer": 1.1000000085914508e-05, + "Canonicalizer": 0.0002209999947808683, + "HoistCompute": 3.999999989900971e-06, + "IdentifyCrossPassTensors": 1.2000000424450263e-05, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 9.000000318337698e-06, + "PruneFunctions": 1.2000000424450263e-05, + "RemoveOptimizationBarriers": 2.9000000722589903e-05, + "ScatterMotion": 3.099999958067201e-05, + "TensorizerLegalizationPass": 1.2000000424450263e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 4.999999873689376e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.700000029813964e-05, + "config-lowering": 0.00010399999882793054, + "constant_folding": 7.999999979801942e-06, + "cse": 1.1000000085914508e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 2.499999936844688e-05, + "flatten-call-graph": 7.000000096013537e-06, + "fuse-send-recv": 1.8999999156221747e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 3.999999989900971e-06, + "hilo::NeuronInstCombine": 5.2999999752501026e-05, + "hilo::NeuronOpFusion": 3.899999865097925e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 9.000000318337698e-06, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 1.8999999156221747e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009130000253207982, + "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05, + "mlir::mhlo::LowerComplexPass": 0.0001250000059371814, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.1000000085914508e-05, + "neuron-hlo-verifier": 0.00036299999919719994, + "operand_upcaster": 1.4000000192027073e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0004330000083427876, + "replace-minimum-constant": 4.999999873689376e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.7000001611886546e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 53.940223693847656, + "HloMacCount": 3254779904.0, + "Traffic": 120680992.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 1.9999999949504854e-06, + "CanonicalizeForTensorizer": 1.2000000424450263e-05, + "Canonicalizer": 0.0003380000125616789, + "HoistCompute": 9.999999974752427e-07, + "IdentifyCrossPassTensors": 9.999999747378752e-06, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 7.999999979801942e-06, + "PruneFunctions": 7.000000096013537e-06, + "RemoveOptimizationBarriers": 2.300000051036477e-05, + "ScatterMotion": 4.999999873689376e-06, + "TensorizerLegalizationPass": 6.000000212225132e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 1.9999999949504854e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8000000636675395e-05, + "config-lowering": 3.600000127335079e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 1.4000000192027073e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 9.999999747378752e-06, + "fuse-send-recv": 1.4999999621068127e-05, + "hilo-conditional-to-select": 6.000000212225132e-06, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 1.2000000424450263e-05, + "hilo::NeuronOpFusion": 1.1000000085914508e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 3.899999865097925e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.004476999863982201, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.2999999853491317e-05, + "mlir::detail::OpToOpPassAdaptor": 2.9999999242136255e-05, + "mlir::hlo::MhloToPyPenguin": 0.004188999999314547, + "mlir::mhlo::LowerComplexExtraPass": 9.000000136438757e-05, + "mlir::mhlo::LowerComplexPass": 0.000155999994603917, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.8000000636675395e-05, + "neuron-hlo-verifier": 0.00033400001120753586, + "operand_upcaster": 1.1000000085914508e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0004529999860096723, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.7999998312443495e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.199999966658652e-05, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 14.666111946105957, + "HloMacCount": 2572550144.0, + "Traffic": 350815552.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.006727457046508789, + "DMALocalityOpt": 0.009476661682128906, + "DMAProfiler": 0.006308317184448242, + "DataStreaming": 0.029163122177124023, + "DoNothing": 0.0004937648773193359, + "ExpandISAMacro": 0.006926536560058594, + "FactorizeBlkDims": 0.049018144607543945, + "InferPSumTensor": 0.049260616302490234, + "InferSharedMemLoc": 0.003329038619995117, + "InsertCoreBarrier": 0.0059740543365478516, + "LateLegalizeInst": 0.019405364990234375, + "LateNeuronInstComb": 0.04540205001831055, + "LegalizeSundaAccess": 0.046309709548950195, + "LegalizeType": 0.05346846580505371, + "LowerBroadcast": 0.015480279922485352, + "LowerIntrinsics": 0.007883310317993164, + "LowerTranspose": 0.010731220245361328, + "NeuronInstComb": 0.03727889060974121, + "NeuronLICM": 0.03041553497314453, + "NeuronSimplifyPredicates": 0.006567955017089844, + "NeuronValueNumbering": 0.015464067459106445, + "SFKVectorizer": 0.07225155830383301, + "SimpleAllReduceTiling": 0.006035804748535156, + "SimplifyNeuronTensor": 0.11353325843811035, + "SpillPSum": 0.0760800838470459, + "WeightCoalescing": 0.01406407356262207 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk0/graph.neff b/context_encoding_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..e8dd34ee8fa2badd9cd021fabb7fd1d836f71e95 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c +size 1188864 diff --git a/context_encoding_model/_tp0_bk0/log-neuron-cc.txt b/context_encoding_model/_tp0_bk0/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..f432efb371edd3e81493fa6621a61aec0dd7bfe9 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/log-neuron-cc.txt @@ -0,0 +1,9197 @@ +2025-11-04T21:38:31Z INFO 8460 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:31Z INFO 8460 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:31Z INFO 8473 [root]: XLA detected +2025-11-04T21:38:31Z INFO 8473 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:31Z INFO 8473 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0 +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8738 + reshape 2000 22.89% ################################################################ + broadcast 1359 15.55% ########################################### + transpose 1044 11.95% ################################# + convert 1001 11.46% ################################ + constant 643 7.36% #################### + parameter 371 4.25% ########### + slice 349 3.99% ########### + add 285 3.26% ######### + multiply 256 2.93% ######## + dot 254 2.91% ######## + get-tuple-element 231 2.64% ####### + select 199 2.28% ###### + compare 174 1.99% ##### + call 146 1.67% #### + concatenate 116 1.33% ### + tuple 57 0.65% # + scatter 57 0.65% # + negate 56 0.64% # + all-reduce 56 0.64% # + divide 31 0.35% + custom-call 30 0.34% + iota 7 0.08% + gather 6 0.07% + all-gather 3 0.03% + reduce 3 0.03% + sine 1 0.01% + cosine 1 0.01% + power 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5608 + reshape 1477 26.34% ################################################################ + transpose 789 14.07% ################################## + convert 776 13.84% ################################# + constant 415 7.40% ################# + parameter 371 6.62% ################ + broadcast 322 5.74% ############# + dot 253 4.51% ########## + custom-call 175 3.12% ####### + multiply 171 3.05% ####### + add 171 3.05% ####### + get-tuple-element 119 2.12% ##### + slice 115 2.05% #### + concatenate 114 2.03% #### + select 86 1.53% ### + compare 60 1.07% ## + scatter 57 1.02% ## + negate 56 1.00% ## + all-reduce 56 1.00% ## + gather 6 0.11% + iota 5 0.09% + all-gather 3 0.05% + reduce 3 0.05% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 87879057408 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element iota multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: IR signature: 1514fa5ba952331bd03900b1f763c1a3225c85e979711655dfc0cb05933559c0 for sg0000/HLOToTensorizer +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: IR signature: 6a6d37701c3f02960f2f0299c2ca09dd9deed842b3b070f1fb3ebf2b56a4f4aa for sg0001/HLOToTensorizer +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: IR signature: 3d2fb4c7ca9ea09ac827c47fc3e081605bbe83e714c1eb5b354efd28800ac986 for sg0002/HLOToTensorizer +2025-11-04T21:38:31Z INFO 8473 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:31Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:31Z INFO 8473 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:31Z INFO 8473 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:31Z INFO 8473 [job.Frontend.0]: Start model loading +2025-11-04T21:38:31Z INFO 8473 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:31Z INFO 8473 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:31Z USER 8473 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:31Z INFO 8473 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:32Z INFO 8554 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:32Z INFO 8555 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:32Z INFO 8556 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:32Z INFO 8555 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.018 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.011 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.020 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.008 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.032 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.009 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.017 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.058 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.015 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.028 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.069 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.055 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.063 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.014 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.161 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.052 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.009 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.024 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.059 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.024 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.050 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.010 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.042 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.181 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.189 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.029 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.186 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.191 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.033 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.034 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.015 seconds +2025-11-04T21:38:32Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:32Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.112 seconds +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:32Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.011 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.050 seconds +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.047 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.054 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.018 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.018 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.045 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6941 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.7078 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=524288 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 128) %'all_gather.1' = AllGatherOp-46 AllGather_add(bfloat16 (1024, 128) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 128), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 19 | , id = 46 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.016 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:33Z INFO 8556 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.019 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.013 seconds +2025-11-04T21:38:33Z INFO 8554 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.016 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.014 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.023 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.012 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.025 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.070 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.017 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.017 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.051 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.030 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.111 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.017 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.145 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.063 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.093 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 36 +total number of sharded dags: 9 + +total bytes transferred from input, output, non local tensors: 351868706 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 349747984 +% bytes transferred with 2x bandwidths: 99.40 + +NC0 FLOPs: 2573842146 +NC1 FLOPs: 2572524640 +% FLOPs sharded: 99.97 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 2, Number of dags: 3 +Matmuls sharded with this dim: +[128,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [128,8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[128,2,8,128] @ [2,8,128,2(s),6,2,128] = [128,2(s),6,2,128] Number of occurrences: 2 + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.034 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.260 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [Tensorizer]: After optimization: 31 statements +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG75'), (10, 'AG77'), (20, 'AG76')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG81'), (0, 'AG78'), (17, 'AG80'), (19, 'AG79')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG85'), (21, 'AG84'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.037 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.015 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:34Z INFO 8556 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.342 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:34Z INFO 8555 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.431 seconds +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.036 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.021 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.062 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.038 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.036 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.412 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.020 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 34 +total number of sharded dags: 9 + +total bytes transferred from input, output, non local tensors: 15032326 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 14502400 +% bytes transferred with 2x bandwidths: 96.47 + +NC0 FLOPs: 875945987 +NC1 FLOPs: 805699584 +% FLOPs sharded: 95.80 + + +Shard dim: 2, Number of dags: 7 +Matmuls sharded with this dim: +[128,2(s),8,128] @ [2(s),8,128,2,2,128] = [128,2,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[128,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [128,2,2,2,2,64] Number of occurrences: 1 +[128,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [128,2,2,2,64] Number of occurrences: 1 +[128,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [128,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 128, Number of dags: 1 +Matmuls sharded with this dim: + + + +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.021 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.205 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 96: matmul_128x128x512 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 24: simd128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: simd128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.094 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.028 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.032 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.154 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.036 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.312 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.023 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.048 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.027 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 724 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(21, 'AG105'), (20, 'AG106')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 725 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG105'), (20, 'AG106')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(17, 'AG109'), (15, 'AG112')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(21, 'AG105'), (20, 'AG106')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(10, 'AG107'), (9, 'AG115'), (11, 'AG114'), (15, 'AG112'), (17, 'AG109')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(17, 'AG109'), (15, 'AG112')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG105'), (20, 'AG106')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG107'), (9, 'AG115'), (11, 'AG114'), (19, 'AG116')] +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 481 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(4, 'AG120'), (14, 'AG119'), (9, 'AG115'), (11, 'AG114'), (12, 'AG113'), (16, 'AG118'), (18, 'AG117')] +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.048 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.163 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.012 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.251 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.036 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.031 seconds +2025-11-04T21:38:35Z INFO 8555 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.073 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.331 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.019 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.112 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: matmul_128x128x512 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: simd128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: simd128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma128x1024 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.015 seconds +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:35Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.054 seconds +2025-11-04T21:38:35Z INFO 8554 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.036 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.031 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.789 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 32: matmul_128x128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 16: matmul_128x128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 8: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: matmul_128x128x256 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: generic_store128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 4: generic_store128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 2: simd128x64 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 2: indirect_load128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingBottleneck]: 2: simd128x64 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.105 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.068 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.051 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.056 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.033 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 31 +total number of sharded dags: 7 + +total bytes transferred from input, output, non local tensors: 53634052 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 51380224 +% bytes transferred with 2x bandwidths: 95.80 + +NC0 FLOPs: 3293413379 +NC1 FLOPs: 3222405120 +% FLOPs sharded: 98.91 + + +Shard dim: 2, Number of dags: 7 +Matmuls sharded with this dim: +[128,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [128,8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[128,2(s),8,128] @ [2(s),8,128,2,2,128] = [128,2,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[128,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [128,2,2,2,2,64] Number of occurrences: 1 +[128,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [128,2,2,2,64] Number of occurrences: 1 +[128,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [128,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 +[128,2,8,128] @ [2,8,128,2(s),6,2,128] = [128,2(s),6,2,128] Number of occurrences: 2 + + + +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.067 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.021 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.071 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.018 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.056 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.627 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.056 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.121 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.056 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.069 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (4, 'AG116'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 708 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (4, 'AG116'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 709 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (4, 'AG116'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG118'), (2, 'AG120'), (11, 'AG119')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG124'), (0, 'AG121'), (6, 'AG123'), (8, 'AG122')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(16, 'AG125'), (14, 'AG126')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(4, 'AG116'), (3, 'AG129'), (7, 'AG128'), (14, 'AG126'), (16, 'AG125')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(16, 'AG125'), (14, 'AG126')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG114'), (19, 'AG115')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG116'), (3, 'AG129'), (7, 'AG128'), (18, 'AG130')] +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(1, 'AG134'), (12, 'AG133'), (3, 'AG129'), (7, 'AG128'), (9, 'AG127'), (15, 'AG132'), (17, 'AG131')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 354 of IO tensor non_local bfloat16 %add.4(128, 2, 8, 128) is not sorted, index list (w/ AG ids): [(10, 'AG117'), (4, 'AG116')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 355 of IO tensor non_local bfloat16 %all_reduce.1(128, 2, 8, 128) is not sorted, index list (w/ AG ids): [(10, 'AG117'), (4, 'AG116')] +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 610 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate6|NHC|(1, 128, 2, 8, 128) is not sorted, index list (w/ AG ids): [(10, 'AG117'), (4, 'AG116')] +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.030 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.196 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.020 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8555 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.103 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:36Z INFO 8556 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.305 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: matmul_128x128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: matmul_128x128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: simd128x512 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: dma128x4096 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: matmul_128x128x256 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: dma128x4096 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: generic_store128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: generic_store128x128 +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:36Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.085 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.141 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.018 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.528 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.027 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.047 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.054 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.036 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.024 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.017 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.059 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.107 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 96: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.077 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 32: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 24: simd128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 16: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 8: simd128x256 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 8: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 8: softmax128x1x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 4: transpose_128x128 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingBottleneck]: 4: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.039 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.018 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.017 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.050 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.028 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.019 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.035 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.021 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.130 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.134 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.032 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.034 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.029 seconds +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8554 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:37Z INFO 8556 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8555 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.318 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 24: simd128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 8: simd128x256 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 8: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 8: softmax128x1x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.037 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.027 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.046 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.056 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.123 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.045 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.056 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.041 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.031 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.078 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.049 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.098 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.034 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.106 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.052 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.082 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.027 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.105 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.034 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LICM]: LICM finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.119 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.043 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.079 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:38Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.026 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:38Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8556 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.037 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.048 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.042 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.023 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.075 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.032 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.043 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.044 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.032 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.056 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.057 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.078 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.239 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.042 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.023 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 76.679% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'930.1544'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1543, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_930 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 9.754% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.59'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'947.1554'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1552, src_id=None, , instances=600 # dl = tensor_op_name: convert.59_pftranspose_947 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 2.962% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 12, 512) %'input365_local_1014'[i_shard_1210,i15_0_0_0_0,i15_0_0_0_1,i0.128,i3.12,i1.128+128i2.2+256p_1654] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input365'[2i15_0_0_0_0+i15_0_0_0_1,p_1654,i_shard_1210,i0.128,i3.12,i2.2,i1.128] # id=1313, src_id=None, , instances=16 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 2.815% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input366_local_991'[i_shard_1210,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input366'[i_shard_1210,i10_0_0_1,i0.128,i1.4096] # id=1304, src_id=None, , instances=12 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 2.815% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input368_local_1002'[i_shard_1210,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input368'[i_shard_1210,i12_0_0_1,i0.128,i1.4096] # id=1307, src_id=None, , instances=12 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 0.321% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %'934.1628'[i_shard_1210,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (128, 2048) %'add.9'[i0.128,i1.2048] # id=1514, src_id=None, , instances=2 # dl = tensor_op_name: add.9_pftranspose_934 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 0.321% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %'938.1633'[i_shard_1214_1467,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (262144,) %'all_reduce.3-buffer-2030'[2048i0.128+i1.2048] # id=1525, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.3_pftranspose_938 | hlo_id: 66 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 4.329us (1.000MiB, est bw: 242.218GB/s, 0.218% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (262144,) %'dot.14-buffer-2028'[2048i0.128+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %1282[i_shard_1210,i0.128,i1.2048] # id=1318, src_id=None, , instances=2 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 4.329us (1.000MiB, est bw: 242.218GB/s, 0.218% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (128, 16, 128) %'convert.57'[i0.128,i2.2+2i3.4+8i4.2,i1.128] = store bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2, 1024) %'942.1533'[i_shard_1214_1467,i0.128,i4.2,i1.128+128i2.2+256i3.4] # id=1531, src_id=None, , instances=2 # dl = tensor_op_name: convert.57_pftranspose_942 | hlo_id: 75 | [[i0.128];[i1.128, i2.2, i3.4, i4.2]] -> [[i0.128];[i1.128, i2.2, i3.4, i4.2]] +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 2.793us (256.000 B, est bw: 0.092GB/s, 0.141% of tot. time) for uint8<1 x 128> TongaSB partitions[1] uint8 (2, 1, 128) %'select.7020.1745'[i51_0_1113,0,i0.128] = load uint8<1 x 128> uint8 (2, 128) %'scatter.1'[i51_0_1113,i0.128] # id=1407, src_id=None, , instances=2 # dl = tensor_op_name: _select.7020 | hlo_id: 166 | [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.019 seconds +2025-11-04T21:38:39Z INFO 8556 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.083 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.099 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.109 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.034 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.123 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.041 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.045 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.059 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.042 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.030 seconds +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8554 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8555 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.045 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.019 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.059 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.049 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.033 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.096 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.054 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.054 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.053 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.087 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.086 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.025 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.043 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.042 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.071 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.072 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.133 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.078 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.058 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.096 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.239 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.036 seconds +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8555 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.017 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:40Z INFO 8554 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 15.684% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input67_local_1253'[i106_0_0_1354,i32_0_0_0_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input67'[i32_0_0_0_0,i106_0_0_1354,i0.128,i1.4096] # id=1517, src_id=None, , instances=4 # dl = tensor_op_name: _dot.2 | hlo_id: 36 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 15.684% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input61_local_1294'[i122_0_0_0,i122_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input61'[i122_0_0_0,i122_0_0_1,i0.128,i1.4096] # id=1628, src_id=None, , instances=4 # dl = tensor_op_name: _dot.3 | hlo_id: 156 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 8.365% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'input62_local_1283'[i106_0_0_1354,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input62'[i106_0_0_1354,i0.128,i1.4096] # id=1621, src_id=None, , instances=2 # dl = tensor_op_name: _dot | hlo_id: 141 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 8.365% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'input65_local_1269'[i58_0_0_1345,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input65'[i58_0_0_1345,i0.128,i1.4096] # id=1565, src_id=None, , instances=2 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 5.124% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2, 1024) %'custom-call.177.1838'[i106_0_0_1354,i0.128,i3.2,i1.128+128i2.8] = load bfloat16<128 x 2048> non_local bfloat16 (2, 8, 128, 128) %'all_gather.1'[i3.2,i2.8,i0.128,i1.128] # id=1512, src_id=None, , instances=2 # dl = tensor_op_name: _custom-call.177 | hlo_id: 28 | [[i0.128];[i1.128, i2.8, i3.2]] -> [[i0.128];[i1.128, i2.8, i3.2]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 4.417us (256.000KiB, est bw: 59.349GB/s, 3.551% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output2'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[3] int32 (2, 2, 2, 128, 1) %'scatter.7201.2043'[i_shard_1420,i132_0,i132_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (2, 128, 2, 2, 128) %'transpose.18'[i_shard_1420,i0.128,i132_0,i132_1,i1.128] # id=1645, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=8 # dl = tensor_op_name: _scatter.7201 | hlo_id: 196 | if i_shard_1420 == 0 [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 4.417us (256.000KiB, est bw: 59.349GB/s, 3.551% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output1'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[3] int32 (2, 2, 2, 128, 1) %'scatter.7149.2047'[i_shard_1420,i139_0,i139_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (2, 128, 512) %'add.2'[i_shard_1420,i0.128,256i139_0+i1.128+128i139_1] # id=1651, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=8 # dl = tensor_op_name: _scatter.7149 | hlo_id: 181 | if i_shard_1420 == 0 [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 4.359us (512.000KiB, est bw: 120.288GB/s, 3.504% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[1] bfloat16 (2, 128, 8, 128) %'1167.1874'[i1_0_0,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> non_local bfloat16 (2, 8, 128, 128) %'all_gather.1'[i1_0_0,i2.8,i0.128,i1.128] # id=1784, src_id=None, , instances=2 # dl = tensor_op_name: all_gather.1_pftranspose_1167 | hlo_id: 19 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 4.262us (512.000 B, est bw: 0.120GB/s, 3.426% of tot. time) for float32<64 x 1> TongaSB partitions[1] float32 (2, 128, 1) %'input66_local_1259'[i_shard_1395,i0.64+64p_2051,0] = load bfloat16<64 x 1> {'CrossPassTensor': ''}bfloat16 (64, 2) %'input66'[i0.64,p_2051] # id=1523, src_id=None, , instances=4 # dl = tensor_op_name: _custom-call.178 | hlo_id: 44 | [[i0.64];[]] -> [[i0.64];[]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 3.355us (256.000KiB, est bw: 78.145GB/s, 2.697% of tot. time) for bfloat16<128 x 512> TongaSB partitions[1] bfloat16 (2, 128, 512) %'transpose.1_pftranspose_1162'[T_i12_0_1166,i0.128,i1.512] = indirect_load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (151936, 2, 512) %'input60'[i0.128,T_i12_0_1166,i1.512] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[0] int32 (128, 1) %'gather.41.1669'[i0.128,0] # id=1472, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=2 # dl = tensor_op_name: _gather.41 | hlo_id: 16 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.062 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.070 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.070 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.028 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.013 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.030 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.032 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.068 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.066 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.035 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.012 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.031 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.012 seconds +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.049 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.019 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.043 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.037 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.119 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.013 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.037 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.045 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: BirCodeGen estimate #instances=444 in sg0000 +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: IR signature: 3891866412971a23304f83506f0431a79f73f6419a375d3bfda8c2563199f8a8 for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.076 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.066 seconds +2025-11-04T21:38:41Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.053 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: BirCodeGen estimate #instances=444 in sg0000 +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: IR signature: 40f5410b735e03ea11ae3aa345dc3c3e9ae32a5608bdc2c4e99aa63bb368f523 for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: Weights total number of bytes: 102656 +2025-11-04T21:38:41Z INFO 8554 [Tensorizer]: Successfully built model. +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.030 seconds +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.049 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.046 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.218 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.518 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 19.661% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 12, 512) %'input68_local_1203'[i_shard_1397,i15_0_0_0_0,i15_0_0_0_1,i0.128,i3.12,i1.128+128i2.2+256p_1775] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input68'[2i15_0_0_0_0+i15_0_0_0_1,p_1775,i_shard_1397,i0.128,i3.12,i2.2,i1.128] # id=1502, src_id=None, , instances=16 # dl = tensor_op_name: _dot.6 | hlo_id: 53 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 18.690% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input69_local_1180'[i_shard_1397,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input69'[i_shard_1397,i10_0_0_1,i0.128,i1.4096] # id=1493, src_id=None, , instances=12 # dl = tensor_op_name: _dot.4 | hlo_id: 42 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 18.690% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input71_local_1191'[i_shard_1397,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input71'[i_shard_1397,i12_0_0_1,i0.128,i1.4096] # id=1496, src_id=None, , instances=12 # dl = tensor_op_name: _dot.5 | hlo_id: 32 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 6.520% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input78_local_1226'[i99_0_0_1359,i25_0_0_0_0,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input78'[i25_0_0_0_0,i99_0_0_1359,i0.128,i1.4096] # id=1518, src_id=None, , instances=4 # dl = tensor_op_name: _dot.9 | hlo_id: 73 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 6.520% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input72_local_1287'[i115_0_0_0,i115_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input72'[i115_0_0_0,i115_0_0_1,i0.128,i1.4096] # id=1580, src_id=None, , instances=4 # dl = tensor_op_name: _dot.10 | hlo_id: 180 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 3.477% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'input73_local_1276'[i99_0_0_1359,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input73'[i99_0_0_1359,i0.128,i1.4096] # id=1573, src_id=None, , instances=2 # dl = tensor_op_name: _dot.7 | hlo_id: 165 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 3.477% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'input76_local_1242'[i51_0_0_1350,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input76'[i51_0_0_1350,i0.128,i1.4096] # id=1542, src_id=None, , instances=2 # dl = tensor_op_name: _dot.8 | hlo_id: 112 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 2.130% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %'1123.1743'[i_shard_1397,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (128, 2048) %'add.4'[i0.128,i1.2048] # id=1625, src_id=None, , instances=2 # dl = tensor_op_name: add.4_pftranspose_1123 | hlo_id: 17 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 2.130% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 2048) %'1127.1748'[i99_0_0_1359,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (262144,) %'all_reduce.1-buffer-2223'[2048i0.128+i1.2048] # id=1636, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.1_pftranspose_1127 | hlo_id: 56 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 4.417us (256.000KiB, est bw: 59.349GB/s, 1.476% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output4'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[3] int32 (2, 2, 2, 128, 1) %'scatter.7303.1796'[i_shard_1434,i125_0,i125_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (2, 128, 2, 2, 128) %'transpose.43'[i_shard_1434,i0.128,i125_0,i125_1,i1.128] # id=1595, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=8 # dl = tensor_op_name: _scatter.7303 | hlo_id: 214 | if i_shard_1434 == 0 [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.021 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.114 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.048 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.029 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.054 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.072 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.039 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.034 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.043 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8555 [Tensorizer]: BirCodeGen estimate #instances=946 in sg0001 +2025-11-04T21:38:42Z INFO 8555 [Tensorizer]: IR signature: 1a64fb9a9789d86ec8f86155888cf995796402cb4889914d5a227e5f57138360 for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8555 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.047 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8555 [Tensorizer]: BirCodeGen estimate #instances=946 in sg0001 +2025-11-04T21:38:43Z INFO 8555 [Tensorizer]: IR signature: f8f128c1cfe54fd13b017057632a6327d18dfd0c7ab0be36030aeed744a14d8a for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:43Z INFO 8555 [Tensorizer]: Weights total number of bytes: 102400 +2025-11-04T21:38:43Z INFO 8555 [Tensorizer]: Successfully built model. +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.076 seconds +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.045 seconds +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:43Z WARNING 8556 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 88.25 percent of all matmul computation +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.023 seconds +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:43Z INFO 8556 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.073 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.020 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.033 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.033 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.036 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.021 seconds +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.264 seconds +2025-11-04T21:38:44Z INFO 8556 [Tensorizer]: BirCodeGen estimate #instances=24906 in sg0002 +2025-11-04T21:38:44Z INFO 8556 [Tensorizer]: IR signature: 409516c5e6b4ff66271ff48e4eb89c20f462fc0b125361c9946ca1531e07e72b for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:44Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8556 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.324 seconds +2025-11-04T21:38:45Z INFO 8556 [Tensorizer]: BirCodeGen estimate #instances=24906 in sg0002 +2025-11-04T21:38:45Z INFO 8556 [Tensorizer]: IR signature: bbbd7e5d9d760889c2a1fcc1c50ef48c2fb82aeb68b611834173bb23c619c841 for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:45Z INFO 8556 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:45Z INFO 8556 [Tensorizer]: Successfully built model. +2025-11-04T21:38:45Z USER 8473 [root/Tensorizer/Tensorizer]: Tensorizer finished after 13.280 seconds +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: End tensorization +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:45Z INFO 8473 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:45Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:45Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:45Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:45Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:45Z INFO 8473 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:45Z INFO 8473 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels spill_reload,io,vector_dynamic_offsets,scalar_dynamic_offset --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:45Z INFO 8473 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:45Z INFO 9015 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc" +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:45Z INFO 9015 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1886 blocks=6 instructions=1960 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 82mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 82mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.282.2073}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.282.2073}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.010 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 83mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.108 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 155mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.124 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.133 seconds +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1886 blocks=6 instructions=1960 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=410 blocks=2 instructions=336 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=300 blocks=2 instructions=208 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:45Z USER 9015 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:45Z USER 9015 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 300 memory location(s), 2 block(s), and 208 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 410 memory location(s), 2 block(s), and 336 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1176 blocks=2 instructions=1416 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1176 memory location(s), 2 block(s), and 1416 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1886 blocks=6 instructions=1960 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 205 memory location(s), 1 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=205 blocks=1 instructions=168 Max writers: 4 Max Readers: 10 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 150 memory location(s), 1 block(s), and 104 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=150 blocks=1 instructions=104 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 160mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:45 2025 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Total count: 412 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Matmult: 196 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: TensorTensor: 50 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: GenericCopy: 44 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Load: 20 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Activation: 19 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Save: 6 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: CollectiveCompute: 5 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: TensorReduce: 4 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: TensorScalarAffineSelect: 2 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Reciprocal: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: DMACopy: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Iota: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 182mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 627 memory location(s), 1 block(s), and 412 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=627 blocks=1 instructions=412 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 184mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Total count: 442 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Matmult: 196 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: TensorScalarPtr: 69 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: TensorTensor: 50 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: GenericCopy: 44 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Load: 20 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Activation: 19 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Save: 7 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: CollectiveCompute: 5 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: TensorReduce: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: TensorScalarAffineSelect: 2 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Reciprocal: 1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Iota: 1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 9 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.022 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 627 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=627 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 191mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Total count: 910 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Matmult: 665 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: GenericCopy: 51 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: TensorScalarPtr: 50 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Load: 43 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Activation: 40 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: TensorTensor: 34 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: CollectiveCompute: 6 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: TensorReduce: 4 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Select: 3 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Save: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Reciprocal: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: CoreBarrier: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: DMACopy: 1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 0 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Total count: 942 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Matmult: 665 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: TensorScalarPtr: 66 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: GenericCopy: 51 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Load: 43 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Activation: 40 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: TensorTensor: 34 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: DMACopy: 10 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: CollectiveCompute: 6 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Save: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: TensorReduce: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Select: 3 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Reciprocal: 1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: CoreBarrier: 1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 8 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.035 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 193mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 636 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=636 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.034 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 191mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 636 memory location(s), 1 block(s), and 910 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=636 blocks=1 instructions=910 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 192mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.004 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 193mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Total count: 13087 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Matmult: 10442 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: GenericCopy: 1410 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Load: 356 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Save: 320 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: TensorTensor: 29 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Activation: 25 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.225 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 327mb, ru_maxrss: 327mb (delta=115mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:45 2025 + +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Total count: 13075 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Matmult: 10442 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: GenericCopy: 1410 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Load: 356 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Save: 308 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: TensorTensor: 29 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Activation: 25 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.232 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=115mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5180 memory location(s), 1 block(s), and 13087 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5180 memory location(s), 1 block(s), and 13075 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5180 blocks=1 instructions=13087 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5180 blocks=1 instructions=13075 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.028 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.028 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.264 seconds +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=115mb) +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6196 blocks=6 instructions=28024 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:45Z USER 9015 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=478 blocks=2 instructions=825 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=555 blocks=2 instructions=1827 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z USER 9015 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:45Z USER 9015 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 478 memory location(s), 2 block(s), and 825 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 555 memory location(s), 2 block(s), and 1827 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5163 blocks=2 instructions=25372 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5163 memory location(s), 2 block(s), and 25372 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:45Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6196 blocks=6 instructions=28024 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1474_i1}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {_dot.199-t1282_i1}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.199||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1192_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:45Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.7_1197_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:45Z USER 9015 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-11-04T21:38:45Z USER 9015 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:45Z USER 9015 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:45Z USER 9015 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 251mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:45Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.037 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.037 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:46Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.038 seconds +2025-11-04T21:38:46Z INFO 9015 [BackendPassManager]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:46Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6196 blocks=6 instructions=28024 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:46Z USER 9015 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:46Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=478 blocks=2 instructions=825 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=555 blocks=2 instructions=1827 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:46Z USER 9015 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 478 memory location(s), 2 block(s), and 825 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 555 memory location(s), 2 block(s), and 1827 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=5163 blocks=2 instructions=25372 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5163 memory location(s), 2 block(s), and 25372 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:46Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 9015 [BackendPassManager]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:46Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6196 blocks=6 instructions=28024 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z WARNING 9015 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z WARNING 9015 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 254mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z WARNING 9015 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 32 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z WARNING 9015 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.003 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 4 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 4 clusters. +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 4 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 4 clusters. +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 6 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 6 clusters. +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Allocs: 252 instructions: 431 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Allocs: 292 instructions: 942 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 929 edges +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Done build fdeps 929 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 6 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 6 clusters. +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 252 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=252 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Allocs: 226 instructions: 394 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 834 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Done build fdeps 834 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: size = 59 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 255mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: found 42 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: mean: 1.42373 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: median: 0.313439 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 336 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: lo = 59 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: total = 59 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z WARNING 9015 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.002 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 2457 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Done build fdeps 2457 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 226 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.001 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=226 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7118596 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3701 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 696322 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 905 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 397824 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.009 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=292 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: size = 165 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: found 31 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: largest = _dot.3-t1360_i1 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: tensors = 3 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: requires 10240 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: size = 59 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: found 42 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: mean: 1.42373 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: median: 0.313439 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 336 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: lo = 59 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: total = 59 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Allocs: 263 instructions: 885 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 7116544 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4271 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 696320 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 906 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 131584 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 514 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: size = 141 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: found 31 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: largest = _dot.3-t1360_i3 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: tensors = 3 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: requires 10240 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: 12 remat count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: 11 remat count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Num intervals 165 Num locations 165 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Num intervals 141 Num locations 141 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: edge: 1244 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: edge: 1445 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: mean: 17.6454 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: median: 14.5847 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: mean: 17.5152 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: median: 13.4803 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: safe = 139 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: unsafe = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: total = 139 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 141 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: safe = 163 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: unsafe = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: total = 163 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 165 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Total: 139 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (139) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Rover zone: 0.986 (137) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.007 (1) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.007 (1) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Blocks tall: 1.000 (139) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.990 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 291 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=291 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Total: 163 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (163) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Rover zone: 0.982 (160) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.012 (2) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.006 (1) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.006 (1) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.994 (162) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.965 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 292 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=292 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7116544 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4271 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 696320 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 906 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 131584 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 514 bytes +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 7118596 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3701 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 696322 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 905 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 397824 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 228 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=228 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7812864, 81.0217% input load, 3.87955% output write, 15.0988% spill/reload [sg0000] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 254 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=254 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 7814918, 81.0266% input load, 3.87856% output write, 15.0948% spill/reload [sg0000] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: size = 73 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(6.33216e+06) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: found 69 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: mean: 1.89041 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: median: 0.977612 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 552 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(6.33011e+06) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 2310 edges +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.010 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Done build fdeps 2310 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: lo = 73 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: total = 73 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3701 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 905 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4271 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7118596 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3701 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 906 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 7116544 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 696322 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4271 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 905 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 696320 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 906 bytes +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7812864, 81.0217% input load, 3.87955% output write, 15.0988% spill/reload [sg0000] +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7116544 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4271 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 696320 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 906 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 7814918, 81.0266% input load, 3.87856% output write, 15.0948% spill/reload [sg0000] +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 131584 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 514 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2953 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 7118596 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3701 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 696322 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 905 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 397824 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 172 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1640 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.015 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 33 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 885 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=263 blocks=1 instructions=885 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 12 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 26407428 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5156 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1310722 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3404 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: size = 186 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: found 65 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: largest = _dot.6-t1328_i2 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: requires 15360 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 262 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=262 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 263 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=263 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: size = 71 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: found 68 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: mean: 1.91549 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: median: 0.96812 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 544 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: lo = 71 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: total = 71 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 33 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 11 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: 28 remat count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 26403328 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5428 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 262144 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2048 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Num intervals 186 Num locations 186 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: size = 161 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: edge: 1994 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: mean: 21.4409 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: median: 17.2962 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: found 63 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: largest = _dot.6-t1328_i5 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: requires 15360 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.029 seconds +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: safe = 177 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: unsafe = 6 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: inf = 1 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: total = 184 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 186 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Total: 184 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (184) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Rover zone: 0.984 (181) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.005 (1) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.011 (2) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.005 (1) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Blocks tall: 0.995 (183) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.976 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: 27 remat count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Num intervals 161 Num locations 161 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 26407428 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5156 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1310722 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3404 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.010 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 293 memory location(s), 1 block(s), and 941 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=293 blocks=1 instructions=941 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 27718150, 91.4883% input load, 1.8915% output write, 6.62024% spill/reload [sg0001] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z WARNING 9015 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: edge: 1748 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: mean: 21.7143 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: median: 17.4777 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 7 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: safe = 152 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: unsafe = 6 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: inf = 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: total = 159 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 161 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Total: 159 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (159) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Rover zone: 0.981 (156) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.006 (1) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.013 (2) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Blocks tall: 1.000 (159) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.236437% out of total dma traffic(2.53589e+07) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 26403328 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5428 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 262144 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2048 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 5414 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 3404 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 26341892 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5414 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1310722 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3404 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 264 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=264 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 26665472, 95.0846% input load, 0% output write, 4.91542% spill/reload [sg0001] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 65536, 0.236437% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 27652614, 91.4681% input load, 1.89599% output write, 6.63593% spill/reload [sg0001] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 26341892 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5414 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1310722 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3404 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 266240 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3825 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 939 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=290 blocks=1 instructions=939 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.021 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: reserved space = 33024 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 10 out of 51 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 394, number of allocs: 227 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 4.5e-05 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.245771% out of total dma traffic(2.53548e+07) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Allocs: 227 instructions: 394 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 834 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [build_flow_deps]: Done build fdeps 834 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:46Z INFO 9015 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.029 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.006 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.014 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: reserved space = 37120 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 5715 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 2048 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.005 seconds +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=227 blocks=1 instructions=394 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 10 out of 52 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 431 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=253 blocks=1 instructions=431 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 26337792 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5715 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 262144 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2048 bytes +2025-11-04T21:38:46Z USER 9015 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 227 memory location(s), 1 block(s), and 394 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 65536, 0.245771% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 26599936, 95.0725% input load, 0% output write, 4.92753% spill/reload [sg0001] +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 26337792 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5715 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 262144 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2048 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 5616 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.016 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 882 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=261 blocks=1 instructions=882 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 432, number of allocs: 253 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.000238 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.019 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 939 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=290 blocks=1 instructions=939 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: reserved space = 36864 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Allocs: 253 instructions: 432 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 939 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=290 blocks=1 instructions=939 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 939 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=290 blocks=1 instructions=939 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 12 out of 60 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 939 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=290 blocks=1 instructions=939 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 942, number of allocs: 290 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 6.7e-05 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 933 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [build_flow_deps]: Done build fdeps 933 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.003 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Allocs: 290 instructions: 942 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=253 blocks=1 instructions=432 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.053 seconds +2025-11-04T21:38:46Z USER 9015 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 253 memory location(s), 1 block(s), and 432 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 2470 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [build_flow_deps]: Done build fdeps 2470 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.023 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 882 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=261 blocks=1 instructions=882 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 882 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=261 blocks=1 instructions=882 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 882 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=261 blocks=1 instructions=882 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 10 out of 57 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 882 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=261 blocks=1 instructions=882 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 884, number of allocs: 261 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 9.1e-05 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1474_i1}@SB<0,54280>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Allocs: 261 instructions: 884 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.014 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=290 blocks=1 instructions=942 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 2319 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [build_flow_deps]: Done build fdeps 2319 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:46Z USER 9015 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 290 memory location(s), 1 block(s), and 942 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2357 memory location(s), 1 block(s), and 12295 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2357 blocks=1 instructions=12295 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=261 blocks=1 instructions=884 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z USER 9015 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 261 memory location(s), 1 block(s), and 884 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.066 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.010 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 257mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2806 memory location(s), 1 block(s), and 13077 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2806 blocks=1 instructions=13077 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Allocs: 2359 instructions: 12297 +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:46Z INFO 9015 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 31848 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Done build fdeps 31848 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.092 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2359 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [build_flow_deps]: Allocs: 2808 instructions: 13075 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.028 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2359 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2360 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2360 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 259mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: size = 1008 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: 50% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 4 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: found 986 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: mean: 1.95635 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: median: 1.23155 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 7888 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 43764 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [build_flow_deps]: Done build fdeps 43764 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: lo = 934 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: total = 1008 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PSUM_Allocator]: 50% PSUM utilization after allocation +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.094 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.197 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2808 memory location(s), 1 block(s), and 13075 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2808 blocks=1 instructions=13075 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.029 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 422 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.060 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2745 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2746 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2746 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 12 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: size = 1132 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.073 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 175550866 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4248 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 677632 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1218 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: 50% PSUM demand before spilling +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 4 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: found 1048 edges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: mean: 1.85159 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: median: 1.14234 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 8384 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: size = 1316 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: found 1003 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1126_i7 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: requires 15360 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: lo = 1058 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: total = 1132 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [PSUM_Allocator]: 50% PSUM utilization after allocation +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.049 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: 326 remat count +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.017 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Num intervals 1316 Num locations 1316 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: edge: 7406 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: mean: 11.2553 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: median: 5.31293 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: safe = 1305 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: unsafe = 8 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: inf = 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: total = 1314 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1316 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Total: 1314 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (1314) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Rover zone: 0.986 (1295) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.011 (14) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.004 (5) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.017 (22) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.002 (2) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.716 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.714 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.714 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.982 (1290) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.756 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.996 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 484 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 175550866 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4248 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 677632 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1218 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.070 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 12 PSUM Banks +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.018 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2361 memory location(s), 1 block(s), and 12297 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2361 blocks=1 instructions=12297 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 176228498, 99.0185% input load, 0% output write, 0.981493% spill/reload [sg0002] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.048 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 176186270 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4203 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1217291 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1127 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: size = 1568 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: found 1127 accumulation groups +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1126_i3 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: requires 15360 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.74499e+08) +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: 336 remat count +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Num intervals 1568 Num locations 1568 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: edge: 8884 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: mean: 11.3316 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: median: 5.51822 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: safe = 1555 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: unsafe = 10 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: inf = 1 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: total = 1566 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1568 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Total: 1566 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (1566) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Rover zone: 0.961 (1505) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.017 (27) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.019 (30) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Slice zone: 0.003 (4) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.072 (112) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.021 (33) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.597 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.612 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.831 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.907 (1421) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.672 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.952 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 4261 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 1582 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 175550610 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4261 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 677376 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1582 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.029601% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.000290532% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 176227986, 99.0188% input load, 0% output write, 0.981205% spill/reload [sg0002] +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 175550610 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4261 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 677376 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1582 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4231 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.100 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12296 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2359 blocks=1 instructions=12296 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 173 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 176186270 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4203 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1217291 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1127 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.193 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2747 memory location(s), 1 block(s), and 13012 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2747 blocks=1 instructions=13012 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 177403561, 98.5409% input load, 2.25475e-06% output write, 1.45914% spill/reload [sg0002] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.74815e+08) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.183 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12296 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2359 blocks=1 instructions=12296 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: spill space = 524288 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 524288 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: size = 1 +2025-11-04T21:38:46Z INFO 9015 []: find first defs for local +2025-11-04T21:38:46Z INFO 9015 []: find first defs for global +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: Num intervals 1 Num locations 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: lo = 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: total = 1 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.025 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12296 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2359 blocks=1 instructions=12296 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 4216 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 1278 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 176186014 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4216 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1217035 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1278 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 524288 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12296 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2359 blocks=1 instructions=12296 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.0197793% out of total spill/reload dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.000288608% out of total dma traffic +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 177403049, 98.5411% input load, 2.25475e-06% output write, 1.45885% spill/reload [sg0002] +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 176186014 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4216 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 599 out of 1221 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12296 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1217035 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1278 bytes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2359 blocks=1 instructions=12296 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4149 bytes +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.114 seconds +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13011 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2745 blocks=1 instructions=13011 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 12299, number of allocs: 2359 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.001354 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 3e-06 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 267mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {_dot.199-t1282_i1}@SB<0,54280>(128x4096)#Internal DebugInfo: <_dot.199||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:46Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1192_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:46Z WARNING 9015 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.7_1197_i1}@SB<96,17536>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.025 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 268mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 186 Sb address +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 268mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Allocs: 2359 instructions: 12299 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 31850 edges +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [build_flow_deps]: Done build fdeps 31850 Tue Nov 4 21:38:46 2025 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.051 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 268mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 268mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:46Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.066 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.020 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2359 blocks=1 instructions=12299 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2359 memory location(s), 1 block(s), and 12299 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 25 Sb address +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 179 Sb address +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.285 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13011 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2745 blocks=1 instructions=13011 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: spill space = 531460 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 552960 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:47Z INFO 9015 []: find first defs for local +2025-11-04T21:38:47Z INFO 9015 []: find first defs for global +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.024 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13011 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2745 blocks=1 instructions=13011 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.011 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13011 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2745 blocks=1 instructions=13011 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 599 out of 1360 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13011 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2745 blocks=1 instructions=13011 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 13014, number of allocs: 2745 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.000608 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.012 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.037 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [build_flow_deps]: Allocs: 2745 instructions: 13014 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 43703 edges +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [build_flow_deps]: Done build fdeps 43703 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.055 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.076 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2745 blocks=1 instructions=13014 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2745 memory location(s), 1 block(s), and 13014 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 1.402 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6135 blocks=6 instructions=27965 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=480 blocks=2 instructions=826 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=551 blocks=2 instructions=1826 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 480 memory location(s), 2 block(s), and 826 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=480 blocks=2 instructions=826 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 551 memory location(s), 2 block(s), and 1826 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=551 blocks=2 instructions=1826 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 559 memory location(s), 2 block(s), and 1846 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=559 blocks=2 instructions=1846 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5104 blocks=2 instructions=25313 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.003 seconds +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.002 seconds +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 486 memory location(s), 2 block(s), and 842 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=486 blocks=2 instructions=842 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 486 memory location(s), 2 block(s), and 846 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5104 memory location(s), 2 block(s), and 25313 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=5104 blocks=2 instructions=25313 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 559 memory location(s), 2 block(s), and 1850 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5110 memory location(s), 2 block(s), and 25331 instruction(s). Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=5110 blocks=2 instructions=25331 Max writers: 298 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.026 seconds +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5110 memory location(s), 2 block(s), and 25335 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.041 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28031 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: reserved space = 37120 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: spill space = 1835008 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 1835008 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: size = 4 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: reserved space = 33024 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: spill space = 1835008 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 1835008 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: spill space = 2621440 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 2621440 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: reserved space = 36864 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: spill space = 2621440 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 896 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 2621440 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: size = 5 +2025-11-04T21:38:47Z INFO 9015 []: find first defs for local +2025-11-04T21:38:47Z INFO 9015 []: find first defs for global +2025-11-04T21:38:47Z INFO 9015 []: find first defs for local +2025-11-04T21:38:47Z INFO 9015 []: find first defs for global +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: Num intervals 5 Num locations 5 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: lo = 5 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: total = 5 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 2097152 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 2097152 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 2621440 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: lo = 4 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: total = 4 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 1835008 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 1835008 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 1835008 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.016 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: reserved space = 557056 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: spill space = 1891330 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 1937408 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.017 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12310 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: reserved space = 566284 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: spill space = 1891330 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 1937408 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: size = 18 +2025-11-04T21:38:47Z INFO 9015 []: find first defs for local +2025-11-04T21:38:47Z INFO 9015 []: find first defs for global +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Num intervals 18 Num locations 18 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: lo = 18 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: total = 18 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 524288 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 1589248 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 1589248 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 2113536 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.043 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.044 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28031 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=559 blocks=2 instructions=1850 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=486 blocks=2 instructions=846 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 486 memory location(s), 2 block(s), and 846 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=5110 blocks=2 instructions=25335 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 559 memory location(s), 2 block(s), and 1850 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5110 memory location(s), 2 block(s), and 25335 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28031 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.005 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 896 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.022 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12310 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.026 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.027 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 291mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28031 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:47Z USER 9015 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:47Z INFO 9015 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=3298 blocks=3 instructions=14421 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=2857 blocks=3 instructions=13610 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.148 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 2857 memory location(s), 3 block(s), and 13610 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.169 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3298 memory location(s), 3 block(s), and 14421 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: nc_parallel_pass finished after 0.170 seconds +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:47Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28031 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 896 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12310 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware simulation time: 136719 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware simulation time: 139728 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.022 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9015 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware simulation time: 5607495 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware simulation time: 5839155 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.039 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.040 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 896 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 896 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=265 blocks=1 instructions=896 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z USER 9015 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:47Z USER 9015 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:47Z INFO 9015 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:48Z INFO 9015 [post_scheduler]: Time-aware simulation time: 1097093 +2025-11-04T21:38:48Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.674 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 327mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12310 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 319mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12310 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2362 blocks=1 instructions=12310 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.017 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 319mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z INFO 9015 [post_scheduler]: Time-aware simulation time: 1273044 +2025-11-04T21:38:48Z INFO 9015 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.834 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:48Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.853 seconds +2025-11-04T21:38:48Z INFO 9015 [BackendPassManager]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:48Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z USER 9015 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=486 blocks=2 instructions=846 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9015 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=559 blocks=2 instructions=1849 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 486 memory location(s), 2 block(s), and 846 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z USER 9015 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9015 (sg01) [SubgraphForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 559 memory location(s), 2 block(s), and 1849 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5110 blocks=2 instructions=25330 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (sg02) [SubgraphForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5110 memory location(s), 2 block(s), and 25330 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:48Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9015 [BackendPassManager]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:48Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 34 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 36 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 26 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 33 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 11 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 31 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.023 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 13 PSUM Banks +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [build_flow_deps]: Allocs: 256 instructions: 442 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 976 edges +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [build_flow_deps]: Done build fdeps 976 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 4 │ 37120 │ +│ Load │ ExternalInput -> Internal │ 14 │ 6295044 │ +│ Load │ Internal │ 2 │ 786432 │ +│ Save │ Internal │ 2 │ 393216 │ +│ Save │ Internal -> Output │ 5 │ 303106 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 6 │ +│ 524288 │ 2 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 196 #MatMult-Transposes 44 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: IO Tensor size combined: 457971204 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input67_local_1253_i1 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1253_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input65_local_1269_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1294_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1294_i1 │ Internal │ bfloat16 │ 1048576 │ +│ input62_local_1283_i0 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate1 │ Output │ bfloat16 │ 524288 │ +│ dot.4-buffer-2425 │ Internal │ bfloat16 │ 524288 │ +│ all_gather.1 │ Internal │ bfloat16 │ 524288 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z USER 9015 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 442 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [build_flow_deps]: Allocs: 230 instructions: 404 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 26 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 883 edges +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [build_flow_deps]: Done build fdeps 883 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ Load │ Const -> Internal │ 2 │ 33024 │ +│ Load │ ExternalInput -> Internal │ 13 │ 6297088 │ +│ Load │ Internal │ 2 │ 786432 │ +│ Save │ Internal │ 2 │ 393216 │ +│ Save │ Internal -> Output │ 4 │ 303104 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 6 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 196 #MatMult-Transposes 44 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: IO Tensor size combined: 457971204 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input67_local_1253_i3 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1253_i2 │ Internal │ bfloat16 │ 1048576 │ +│ input65_local_1269_i1 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1294_i2 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1294_i3 │ Internal │ bfloat16 │ 1048576 │ +│ input62_local_1283_i1 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate1 │ Output │ bfloat16 │ 524288 │ +│ dot.4-buffer-2425 │ Internal │ bfloat16 │ 524288 │ +│ all_gather.1 │ Internal │ bfloat16 │ 524288 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z USER 9015 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 404 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [build_flow_deps]: Allocs: 265 instructions: 895 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 2344 edges +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [build_flow_deps]: Done build fdeps 2344 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬───────────────────────────┬───────┬──────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼──────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 32 │ 25174528 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 1048576 │ +│ Save │ Internal │ 1 │ 262144 │ +└─────────────┴───────────────────────────┴───────┴──────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 1 │ +│ 4096 │ 2 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 649 #MatMult-Transposes 50 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1203_i7 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i6 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i5 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i4 │ Internal │ bfloat16 │ 1572864 │ +│ input69_local_1180_i11 │ Internal │ bfloat16 │ 1048576 │ +│ input71_local_1191_i6 │ Internal │ bfloat16 │ 1048576 │ +│ input69_local_1180_i10 │ Internal │ bfloat16 │ 1048576 │ +│ input71_local_1191_i7 │ Internal │ bfloat16 │ 1048576 │ +│ input69_local_1180_i9 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z USER 9015 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 895 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.040 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.011 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.007 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [build_flow_deps]: Allocs: 294 instructions: 954 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 2508 edges +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [build_flow_deps]: Done build fdeps 2508 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 4 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 33 │ 25174532 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 1048576 │ +│ Save │ Internal │ 2 │ 786432 │ +│ Save │ Internal -> Output │ 2 │ 524290 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 1 │ +│ 4096 │ 4 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 262144 │ 3 │ +│ 524288 │ 2 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 665 #MatMult-Transposes 66 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1203_i3 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i2 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i1 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i0 │ Internal │ bfloat16 │ 1572864 │ +│ input69_local_1180_i5 │ Internal │ bfloat16 │ 1048576 │ +│ input71_local_1191_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input69_local_1180_i4 │ Internal │ bfloat16 │ 1048576 │ +│ input71_local_1191_i1 │ Internal │ bfloat16 │ 1048576 │ +│ input69_local_1180_i3 │ Internal │ bfloat16 │ 1048576 │ +└───────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:48Z USER 9015 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 954 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 711 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 10 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 826 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 74 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 10 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 133 PSUM Banks +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.393 seconds +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 327mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.100 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 328mb, ru_maxrss: 328mb (delta=1mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.496 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 328mb (delta=1mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.014 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 328mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [build_flow_deps]: Allocs: 2748 instructions: 13025 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.102 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 331mb (delta=3mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 41988 edges +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [build_flow_deps]: Done build fdeps 41988 Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.014 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [build_flow_deps]: Allocs: 2362 instructions: 12305 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.140 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=3mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 4 │ 524288 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174466060 │ +│ Load │ Internal │ 24 │ 1371018 │ +│ Save │ Internal │ 319 │ 1217031 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 17 │ +│ 2048 │ 1 │ +│ 4096 │ 301 │ +│ 6144 │ 8 │ +│ 8192 │ 12 │ +│ 9496 │ 2 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 31708 edges +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [build_flow_deps]: Done build fdeps 31708 Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 10438 #MatMult-Transposes 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348922384 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 512 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1014_i3 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i2 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i1 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i0 │ Internal │ bfloat16 │ 1572864 │ +│ input366_local_991_i1 │ Internal │ bfloat16 │ 1048576 │ +│ input366_local_991_i2 │ Internal │ bfloat16 │ 1048576 │ +│ -t3011 │ Internal │ float32 │ 1048576 │ +│ input366_local_991_i3 │ Internal │ bfloat16 │ 1048576 │ +│ -t3005 │ Internal │ float32 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.061 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 1 │ 524288 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174466060 │ +│ Load │ Internal │ 10 │ 1051782 │ +│ Save │ Internal │ 301 │ 677376 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 1 │ +│ 4096 │ 300 │ +│ 6144 │ 8 │ +│ 8192 │ 12 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 10314 #MatMult-Transposes 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348922384 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 512 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1014_i7 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i6 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i5 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i4 │ Internal │ bfloat16 │ 1572864 │ +│ input366_local_991_i11 │ Internal │ bfloat16 │ 1048576 │ +│ input368_local_1002_i6 │ Internal │ bfloat16 │ 1048576 │ +│ input366_local_991_i10 │ Internal │ bfloat16 │ 1048576 │ +│ input368_local_1002_i7 │ Internal │ bfloat16 │ 1048576 │ +│ input366_local_991_i9 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12305 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.681 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 320mb, ru_maxrss: 331mb (delta=4mb) +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=6155 blocks=6 instructions=28025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 5 DMA instructions. Moved 3 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 4 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 4 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 2 DMA instructions. Moved 1 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 323 DMA instructions. Moved 4 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 303 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:49Z INFO 9015 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: assign_trigger_engine finished after 0.013 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Output has 6 module(s), 6 function(s), 6155 memory location(s), 6 block(s), and 28025 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=294 blocks=1 instructions=954 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=265 blocks=1 instructions=895 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=230 blocks=1 instructions=404 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2362 blocks=1 instructions=12305 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=256 blocks=1 instructions=442 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2748 blocks=1 instructions=13025 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=6155 blocks=6 instructions=28039 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: assign_hwdge_engine finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Output has 6 module(s), 6 function(s), 6155 memory location(s), 6 block(s), and 28039 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28039 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2748 blocks=1 instructions=13028 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2362 blocks=1 instructions=12308 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 7 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 2 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 3 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 320 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 10 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 9 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 24 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 9 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 320 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2748 blocks=1 instructions=13028 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=294 blocks=1 instructions=956 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=230 blocks=1 instructions=406 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 17 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 3 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 2 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 4 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 6 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 44 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=230 blocks=1 instructions=406 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=294 blocks=1 instructions=956 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=256 blocks=1 instructions=444 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=265 blocks=1 instructions=897 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 25 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 5 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 2 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 3 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 35 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=256 blocks=1 instructions=444 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=265 blocks=1 instructions=897 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2362 blocks=1 instructions=12308 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.010 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28039 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:49Z USER 9015 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:49Z INFO 9015 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=2857 blocks=3 instructions=13611 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=3298 blocks=3 instructions=14428 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 2857 memory location(s), 3 block(s), and 13611 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3298 memory location(s), 3 block(s), and 14428 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: nc_parallel_pass finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28039 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=256 blocks=1 instructions=444 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=265 blocks=1 instructions=897 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=256 blocks=1 instructions=444 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=265 blocks=1 instructions=897 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=294 blocks=1 instructions=956 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=294 blocks=1 instructions=956 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=256 blocks=1 instructions=444 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=265 blocks=1 instructions=897 Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 256 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2362 blocks=1 instructions=12308 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2748 blocks=1 instructions=13028 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 277 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 277 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=294 blocks=1 instructions=956 Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2362 blocks=1 instructions=12308 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2748 blocks=1 instructions=13028 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=230 blocks=1 instructions=406 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=230 blocks=1 instructions=406 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 767 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=230 blocks=1 instructions=406 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 232 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 252 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 252 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Finished dependency reduction: 1743 removed, new total 234 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 256 memory location(s), 1 block(s), and 444 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Finished dependency reduction: 1604 removed, new total 209 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 230 memory location(s), 1 block(s), and 406 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 807 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 807 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 830 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 882 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 882 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Finished dependency reduction: 4594 removed, new total 257 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.012 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 897 instruction(s). Max writers: 16 Max Readers: 50 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Finished dependency reduction: 4825 removed, new total 294 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.012 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 294 memory location(s), 1 block(s), and 956 instruction(s). Max writers: 16 Max Readers: 66 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.032 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2362 blocks=1 instructions=12308 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.036 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2748 blocks=1 instructions=13028 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 10816 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 11440 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 11440 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 11356 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 12346 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 12346 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Finished dependency reduction: 53178 removed, new total 3006 +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.187 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2362 memory location(s), 1 block(s), and 12308 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Finished dependency reduction: 65914 removed, new total 3826 +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:49Z USER 9015 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.229 seconds +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9015 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2748 memory location(s), 1 block(s), and 13028 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.289 seconds +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: curr_vmrss: 329mb, ru_maxrss: 331mb (delta=0mb) +2025-11-04T21:38:49Z USER 9015 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:49Z INFO 9015 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6155 blocks=6 instructions=28039 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z USER 9015 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:49Z INFO 9015 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=2857 blocks=3 instructions=13611 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Num intermediates 87 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:49Z USER 9015 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:49Z INFO 9015 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=3298 blocks=3 instructions=14428 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: DMA Descriptor ReUse Enabled. +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Num intermediates 87 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 28033 #MatMult-Transposes 6492 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 54 #out: 0 #inp: 54 #symmetric: 0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 28589 #MatMult-Transposes 6924 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 54 #out: 0 #inp: 54 #symmetric: 0 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:49Z INFO 9015 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:49Z INFO 9015 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:49Z USER 9015 (nc01) [CoreForkPass]: bir_linker finished after 0.406 seconds +2025-11-04T21:38:49Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 472mb, ru_maxrss: 472mb (delta=141mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 210640786, 97.8529% input load, 0.143896% output write, 2.00319% spill/reload +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ Load │ Const -> Internal │ 2 │ 33024 │ +│ Load │ ExternalInput -> Internal │ 13 │ 6297088 │ +│ Load │ Internal │ 2 │ 786432 │ +│ Save │ Internal │ 2 │ 393216 │ +│ Save │ Internal -> Output │ 4 │ 303104 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 6 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬───────────────────────────┬───────┬──────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼──────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 32 │ 25174528 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 1048576 │ +│ Save │ Internal │ 1 │ 262144 │ +└─────────────┴───────────────────────────┴───────┴──────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 1 │ +│ 4096 │ 2 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 1 │ 524288 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174466060 │ +│ Load │ Internal │ 10 │ 1051782 │ +│ Save │ Internal │ 301 │ 677376 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 1 │ +│ 4096 │ 300 │ +│ 6144 │ 8 │ +│ 8192 │ 12 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 11159 #MatMult-Transposes 5192 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781412908 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1014_i4_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i5_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i7_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i5_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i7_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i6_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i6_sg0001 │ Internal │ bfloat16 │ 1572864 │ +└───────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: report_stats finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.033 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: spill space = 29442104 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 29556736 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: bir_linker finished after 0.474 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=141mb) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 212870581, 96.9793% input load, 0.388687% output write, 2.63198% spill/reload +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 3 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 4 │ 37120 │ +│ Load │ ExternalInput -> Internal │ 14 │ 6295044 │ +│ Load │ Internal │ 2 │ 786432 │ +│ Save │ Internal │ 2 │ 393216 │ +│ Save │ Internal -> Output │ 5 │ 303106 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 2 │ +│ 16 │ 3 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 512 │ 1 │ +│ 1024 │ 1 │ +│ 2048 │ 4 │ +│ 4096 │ 1 │ +│ 8192 │ 6 │ +│ 524288 │ 2 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 4 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 33 │ 25174532 │ +│ Load │ Input -> Internal │ 3 │ 81920 │ +│ Load │ Internal │ 2 │ 1048576 │ +│ Save │ Internal │ 2 │ 786432 │ +│ Save │ Internal -> Output │ 2 │ 524290 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 128 │ 1 │ +│ 256 │ 3 │ +│ 2048 │ 1 │ +│ 4096 │ 4 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 262144 │ 3 │ +│ 524288 │ 2 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 786432 │ +│ DMACopy │ Internal │ 4 │ 524288 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174466060 │ +│ Load │ Internal │ 24 │ 1371018 │ +│ Save │ Internal │ 319 │ 1217031 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 17 │ +│ 2048 │ 1 │ +│ 4096 │ 301 │ +│ 6144 │ 8 │ +│ 8192 │ 12 │ +│ 9496 │ 2 │ +│ 262144 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 11299 #MatMult-Transposes 5208 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781412908 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1014_i0_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i1_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i3_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i1_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i3_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1014_i2_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1203_i2_sg0001 │ Internal │ bfloat16 │ 1572864 │ +└───────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: report_stats finished after 0.017 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.045 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.042 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: spill space = 29442104 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 29556736 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: size = 87 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 87 Num locations 87 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: lo = 87 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: total = 87 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 2621440 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 5337088 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.040 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: nc_parallel_pass finished after 0.606 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=141mb) +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=7185 blocks=8 instructions=28123 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=7185 blocks=8 instructions=28123 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 28123 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=7185 blocks=8 instructions=28123 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.021 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.017 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.011 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13653 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=3372 blocks=4 instructions=13653 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14470 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13660 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=3813 blocks=4 instructions=14470 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=3372 blocks=4 instructions=13660 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14477 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=3813 blocks=4 instructions=14477 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1110/1110 (100% DGE) + power-of-2 partition : 1110/1115 (99.5516% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1111/1116 (99.552% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 117/117 (100% DGE) + power-of-2 partition : 117/426 (27.4648% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 117/426 (27.4648% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 112 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 1/1 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_dma finished after 0.041 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13660 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=3372 blocks=4 instructions=13660 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: expand_all_engine finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13660 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=3372 blocks=4 instructions=13660 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1137/1137 (100% DGE) + power-of-2 partition : 1137/1227 (92.665% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1138/1228 (92.671% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 206/206 (100% DGE) + power-of-2 partition : 206/548 (37.5912% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 206/548 (37.5912% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 140 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 225/225 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_dma finished after 0.054 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14477 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=3813 blocks=4 instructions=14477 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: expand_all_engine finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14477 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=3813 blocks=4 instructions=14477 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.021 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13660 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=3372 blocks=4 instructions=13660 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: expand_inst_late finished after 0.017 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13666 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=3372 blocks=4 instructions=13666 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [SeqInstOpt]: Removing 1 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 13665 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=3372 blocks=4 instructions=13665 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.028 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14477 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=3813 blocks=4 instructions=14477 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_sync finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14367 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=3372 blocks=4 instructions=14367 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_act finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 383mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=3372 blocks=4 instructions=14378 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: expand_inst_late finished after 0.019 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 384mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14499 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=3813 blocks=4 instructions=14499 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [SeqInstOpt]: Removing 8 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 384mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 14484 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=3813 blocks=4 instructions=14484 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_sync finished after 0.013 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 385mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15351 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=3813 blocks=4 instructions=15351 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_act finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 385mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=3813 blocks=4 instructions=15363 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_dve finished after 0.063 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 387mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=3372 blocks=4 instructions=14378 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: lower_ap finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 387mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=3372 blocks=4 instructions=14378 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: size = 2 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local reg +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global reg +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: lo = 2 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: total = 2 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local reg +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global reg +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_dve finished after 0.103 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=3813 blocks=4 instructions=15363 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: lower_ap finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=3813 blocks=4 instructions=15363 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local reg +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global reg +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: size = 1 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local reg +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global reg +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: lo = 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: total = 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:50Z INFO 9015 []: find first defs for local reg +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:50Z USER 9015 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.122 seconds +2025-11-04T21:38:50Z INFO 9015 []: find first defs for global reg +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:50Z USER 9015 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.101 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: nc_parallel_pass finished after 0.402 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: vnc_remote_addr_map finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 29741 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 [VncLink]: Found 0 remote updates +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 29741 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=3813 blocks=4 instructions=15363 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=3372 blocks=4 instructions=14378 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.091 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.093 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.094 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.017 seconds +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9015 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 29741 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: subgraph_parallel_pass finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: curr_vmrss: 388mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:50Z USER 9015 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9015 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z USER 9015 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=3813 blocks=4 instructions=15363 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=3372 blocks=4 instructions=14378 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000380516 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: +┌────────────────┬────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.00039006 │ +└────────────────┴────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 11246 │ +│ LDWEIGHTS │ 11246 │ +│ EVENT_SEMAPHORE │ 702 │ +│ CAST │ 680 │ +│ COPY │ 632 │ +│ UNKNOWN(0xd4) │ 385 │ +│ PSEUDO_DMA_TRIGGER │ 323 │ +│ ACTIVATE │ 188 │ +│ UNKNOWN(0xd3) │ 145 │ +│ TENSOR_TENSOR │ 109 │ +│ UNKNOWN(0xd8) │ 57 │ +│ MEMSET │ 22 │ +│ TENSOR_SCALAR │ 21 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 11 │ +│ TENSOR_REDUCE │ 11 │ +│ UNKNOWN(0xda) │ 9 │ +│ UNKNOWN(0xd9) │ 7 │ +│ TENSOR_SCALAR_ADDR │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ IOTA │ 3 │ +│ RECIPROCAL │ 3 │ +│ ALU_OP │ 2 │ +│ MOVE │ 2 │ +│ UNKNOWN(0x92) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 801 │ +│ Scalar │ 2193 │ +│ Tensor │ 22572 │ +│ SyncDMA │ 0 │ +│ Vector │ 251 │ +│ Sync │ 71 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.235 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 11510 │ +│ LDWEIGHTS │ 11510 │ +│ EVENT_SEMAPHORE │ 867 │ +│ COPY │ 764 │ +│ CAST │ 680 │ +│ UNKNOWN(0xd4) │ 413 │ +│ PSEUDO_DMA_TRIGGER │ 367 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ GATHER │ 291 │ +│ ACTIVATE │ 195 │ +│ UNKNOWN(0xd3) │ 145 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_TENSOR │ 111 │ +│ UNKNOWN(0xd8) │ 57 │ +│ TENSOR_SCALAR_ADDR │ 41 │ +│ MEMSET │ 36 │ +│ UNKNOWN(0xda) │ 25 │ +│ TENSOR_SCALAR │ 25 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ TENSOR_REDUCE │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ STREAM_SHUFFLE │ 12 │ +│ ACT_TABLE_LOAD │ 12 │ +│ LOAD_MASK_SELECT │ 12 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 7 │ +│ RECIPROCAL │ 5 │ +│ MOVE │ 4 │ +│ IOTA │ 3 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0x92) │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 1545 │ +│ Scalar │ 2341 │ +│ Tensor │ 23101 │ +│ SyncDMA │ 0 │ +│ Vector │ 893 │ +│ Sync │ 116 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 1152 │ +│ qPoolSpillReload0_defId_1 │ 1152 │ +│ qPoolSpillReload0_defId_2 │ 4 │ +│ qSPIO0 │ 5388 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 14 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 8310 (0.000123829 GB) +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ 947.1554_i349_sg0002 │ Internal │ float32 │ 1 │ +│ 947.1554_i486_sg0002 │ Internal │ float32 │ 1 │ +│ 947.1554_i479_sg0002 │ Internal │ float32 │ 1 │ +│ 947.1554_i491_sg0002 │ Internal │ float32 │ 1 │ +│ get_tuple_element.5_sg0002 │ Internal │ float32 │ 2 │ +│ scatter.1_sg0002 │ Internal │ uint8 │ 2 │ +│ input1 │ ExternalInput │ int32 │ 3 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ convert.59_sg0002 │ Internal │ float32 │ 297 │ +└────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.243 seconds +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 142 │ +│ qPoolSpillReload0_defId_0 │ 1152 │ +│ qPoolSpillReload0_defId_1 │ 1536 │ +│ qPoolSpillReload0_defId_2 │ 588 │ +│ qSPIO0 │ 5894 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 358 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 10330 (0.000153929 GB) +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ get_tuple_element.5_sg0002 │ Internal │ float32 │ 3 │ +│ scatter.1_sg0002 │ Internal │ uint8 │ 3 │ +│ input1 │ ExternalInput │ int32 │ 3 │ +│ all-reduce.531.2235_sg0001 │ Internal │ bfloat16 │ 27 │ +│ intermediate7-buffer-2228_sg0001 │ Internal │ bfloat16 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1610_sg0001 │ Internal │ int32 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.59_sg0002 │ Internal │ float32 │ 298 │ +└──────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:50Z USER 9015 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9015 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.029 seconds +2025-11-04T21:38:50Z USER 9015 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.281 seconds +2025-11-04T21:38:50Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9015 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3372 memory location(s), 4 block(s), and 14378 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z USER 9015 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.030 seconds +2025-11-04T21:38:51Z USER 9015 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.294 seconds +2025-11-04T21:38:51Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9015 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3813 memory location(s), 4 block(s), and 15363 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z USER 9015 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:51Z USER 9015 [BackendPassManager]: mod_parallel_pass finished after 0.297 seconds +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: curr_vmrss: 401mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:51Z USER 9015 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z INFO 9015 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 8.094KB │ 21.312KB │ +│ CCE │ 84.000KB │ 48.000KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 22.750KB │ 95.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:51Z INFO 9015 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.650GB │ +│ Model Code │ 1.709MB │ +│ Model Constants │ 409.008KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 5.090MB │ +│ DMA Ring IO │ 114.844KB │ +│ DMA Ring Spill │ 164.312KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9015 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 192.000B │ 9.656KB │ +│ CCE │ 84.000KB │ 36.000KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 8.500KB │ 78.750KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:51Z INFO 9015 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.647GB │ +│ Model Code │ 1.580MB │ +│ Model Constants │ 399.000KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 2.500MB │ +│ DMA Ring IO │ 92.688KB │ +│ DMA Ring Spill │ 124.406KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9015 [HBMUsage]: Total estimated HBM usage is: 3.654GB +2025-11-04T21:38:51Z USER 9015 [BackendPassManager]: hbm_usage finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: curr_vmrss: 401mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 29741 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z USER 9015 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=7185 blocks=8 instructions=29741 Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1460_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.4-1302-1462_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1310-1464_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1778_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.13-1295-1480_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1303-1482_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1630_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1067-1292_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1519_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1460_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1778_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1630_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1519_CRSM.npy +2025-11-04T21:38:51Z INFO 9015 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:51Z WARNING 9015 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/metrics.json +2025-11-04T21:38:51Z WARNING 9015 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:51Z INFO 9015 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff +2025-11-04T21:38:51Z INFO 9015 [NeffFileWriter]: IR signature: ce5705d01c4c23d66db9e2bdbf68c9a4 for neff artifacts +2025-11-04T21:38:51Z USER 9015 [BackendPassManager]: neff_packager finished after 0.172 seconds +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: curr_vmrss: 401mb, ru_maxrss: 472mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9015 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7185 memory location(s), 8 block(s), and 29741 instruction(s). Max writers: 299 Max Readers: 5098 +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.001709 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.001709 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.002441 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.002441 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.000488 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.001968 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.000515 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.001804 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000488 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.002441 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.004971 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.027527 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.000488 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.000488 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000488 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.004971 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ dot.4 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.500000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=local (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ -t2507 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=shared (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ dot.11 │ bfloat16 │ 1 │ 0.000001 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.000001 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate1 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.031250 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 0.031250 MB │ +│ intermediate0 │ uint8 │ 1 │ 0.015625 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate8 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg02, addr_space=local (complete data located at nc01/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.57 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate1 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.031250 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 0.031250 MB │ +│ intermediate0 │ uint8 │ 1 │ 0.015625 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate8 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:51Z INFO 9015 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:51Z INFO 8473 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:51Z INFO 8473 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:51Z INFO 8473 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc +2025-11-04T21:38:51Z INFO 8473 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:51Z INFO 8473 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk0/neuronxcc-ghdy2ddc/hlo_netlist.json +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:51Z INFO 8473 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:51Z INFO 8473 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:51Z INFO 8460 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk0/metaneff.pb b/context_encoding_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..113586fc122818786cb3c53df820f1e2ed6c0293 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec75ee80b2ec3909e8e315fa6044902ec93fdb3a62229b909f551426d04c56b6 +size 2077993 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb b/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..004eaa3e077332f604f64a5bcad9b5b276b9bb28 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b77f309407f7c741dd9b51614fc850fa657ce4e6ca40a18b4471f2b477760976 +size 2163092 diff --git a/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff b/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff new file mode 100644 index 0000000000000000000000000000000000000000..e8dd34ee8fa2badd9cd021fabb7fd1d836f71e95 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/model.MODULE_e7e1b6c43bb87ca73ecc+2ee9f01d.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98834cf4cd3214e9f9fc84530eed5ef31b01fda5919c60b959ca4a30bcb80d0c +size 1188864 diff --git a/context_encoding_model/_tp0_bk0/neuron_config.json b/context_encoding_model/_tp0_bk0/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0d7b2beb8bcff4056d57dbaa3bdb8eba338da9d8 --- /dev/null +++ b/context_encoding_model/_tp0_bk0/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 128 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 128 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk1/command.txt b/context_encoding_model/_tp0_bk1/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f5349f21a9f61f76c9f72a148af8d0017f8c806 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --output model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json b/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json new file mode 100644 index 0000000000000000000000000000000000000000..00a72c6cc3dcbfbe7b582279963f51f7967117ed --- /dev/null +++ b/context_encoding_model/_tp0_bk1/compile_flags.MODULE_2330bfb0632c950ddab1+62ecd68b.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/global_metric_store.json b/context_encoding_model/_tp0_bk1/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..d8395df8e0eeb6db02372905241115dd86a3bf89 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/global_metric_store.json @@ -0,0 +1,1177 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.71436309814453, + "StaticProfiler::AveragePartitionUtilization": 94.08551025390625, + "StaticProfiler::AveragePeUtilization": 96.60899353027344, + "StaticProfiler::LocalizationEfficiency": 95.931884765625, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.08984947204589844, + "AffinePredicateResolution": 0.0009312629699707031, + "AliasDependencyElimination": 0.00024366378784179688, + "AliasDependencyInduction": 0.005263328552246094, + "AliasDependencyReset": 0.04176759719848633, + "BFComputeCutting": 0.002216339111328125, + "BirCodeGenLoop": 0.3660314083099365, + "CCOpFusion": 0.04759931564331055, + "CanonicalizeConv": 4.999999873689376e-05, + "CanonicalizeDAGForPGTiling": 0.006819009780883789, + "CanonicalizeForTensorizer": 3.7000001611886546e-05, + "CanonicalizeIR": 0.0015099048614501953, + "Canonicalizer": 0.0008099999977275729, + "CoalesceCCOp": 0.014320611953735352, + "CommuteConcat": 0.0021598339080810547, + "DMALocalityOpt": 0.006499767303466797, + "DMAProfiler": 0.031740665435791016, + "DMATilingProfiler": 0.007287263870239258, + "DataLocalityOpt": 0.15184760093688965, + "DataStreaming": 0.030707597732543945, + "DeConcat": 0.0052378177642822266, + "DeadCodeElimination": 0.0020182132720947266, + "DeadStoreElimination": 0.007268428802490234, + "DelinearIndices": 0.006491422653198242, + "Delinearization": 0.00418853759765625, + "DelinearizeSPMD": 0.03150320053100586, + "DoNothing": 0.0004954338073730469, + "DramToDramTranspose": 0.028717756271362305, + "DumpGraphAndMetadata": 0.04632568359375, + "EliminateDivs": 0.0021729469299316406, + "ExpandBatchNorm": 0.0017549991607666016, + "ExpandISAMacro": 0.01276254653930664, + "FactorizeBlkDims": 0.07627987861633301, + "FactorizeThreadAxesInFreeDims": 0.0036237239837646484, + "FlattenMacroLoop": 0.012475728988647461, + "GenericAccessSimplifier": 0.0007128715515136719, + "HoistCompute": 9.999999747378752e-06, + "IdentifyCrossPassTensors": 3.899999865097925e-05, + "InferInitValue": 0.11746096611022949, + "InferIntrinsicOnCC": 0.008626222610473633, + "InferNeuronTensor": 0.17520785331726074, + "InferNonlocalTensors": 0.02865004539489746, + "InferPSumTensor": 0.097686767578125, + "InferShardAxis": 0.2832298278808594, + "InferSharedMemLoc": 0.024610280990600586, + "InlineNativeKernels": 0.0025413036346435547, + "InsertCoreBarrier": 0.014633417129516602, + "InsertIOTransposes": 0.058136701583862305, + "InsertImplicitShardAxisBeforeISel": 0.024377822875976563, + "InsertLocalTransposes": 0.016265153884887695, + "InsertOffloadedTransposes": 0.03376030921936035, + "LICM": 0.015621185302734375, + "LateLegalizeInst": 0.037809133529663086, + "LateLegalizePostSplit": 0.01734447479248047, + "LateLowerReshapeOp": 0.0016047954559326172, + "LateLowerTensorOp": 0.0011878013610839844, + "LateNeuronInstComb": 0.07452130317687988, + "LayoutPreprocessing": 0.05620622634887695, + "LayoutPreprocessingAndAnalysis": 0.18100428581237793, + "LayoutRequirementAnalysis": 0.014584064483642578, + "LegalizeCCOpLayout": 0.0032541751861572266, + "LegalizeOpLevelAlias": 0.0010030269622802734, + "LegalizePartitionReduce": 0.002452373504638672, + "LegalizeSundaAccess": 0.07152366638183594, + "LegalizeSundaMacro": 0.0427708625793457, + "LegalizeType": 0.03647494316101074, + "LocalLayoutOpt": 0.014898538589477539, + "LoopFusion": 0.005176067352294922, + "LoopSplitting": 0.00048732757568359375, + "LowerBroadcast": 0.019514799118041992, + "LowerCCOpBlockAxis": 0.004888296127319336, + "LowerComplexBroadcast": 0.010831594467163086, + "LowerIntrinsics": 0.05155062675476074, + "LowerShardAxis": 0.017355918884277344, + "LowerTensorOp": 0.013428449630737305, + "LowerToSendRecv": 0.038613319396972656, + "LowerTranspose": 0.058027029037475586, + "MacroGeneration": 0.1058506965637207, + "MaskPropagation": 0.004538536071777344, + "MemcastMotion": 2.2000000171829015e-05, + "MemcpyElimination": 0.04629826545715332, + "MutateDataType": 0.0012559890747070313, + "NeuronAliasDependencyInduction": 0.0006165504455566406, + "NeuronAliasDependencyReset": 0.03877615928649902, + "NeuronInstComb": 0.05556750297546387, + "NeuronLICM": 0.04741477966308594, + "NeuronLoopFusion": 0.08438324928283691, + "NeuronLoopInterchange": 0.0028100013732910156, + "NeuronSimplifier": 0.0370326042175293, + "NeuronSimplifyPredicates": 0.029002904891967773, + "NeuronValueNumbering": 0.014310836791992188, + "OptimizeAliasedCopyChain": 0.0005040168762207031, + "OptimizeNKIKernels": 4.637849807739258, + "PAGLayoutOpt": 0.15427088737487793, + "PComputeCutting": 0.022019147872924805, + "PGLayoutTilingPipeline": 1.5585658550262451, + "PGTiling": 0.3059046268463135, + "PadElimination": 0.00058746337890625, + "ParAxesAnnotation": 0.07737350463867188, + "PartialLoopFusion": 0.03046131134033203, + "PartialSimdFusion": 0.008630514144897461, + "PenguinizeFunctions": 3.699999797390774e-05, + "PerfectLoopNest": 0.0037374496459960938, + "PruneFunctions": 4.600000102072954e-05, + "RecognizeOpIdiom": 0.0049936771392822266, + "Recompute": 0.0004494190216064453, + "RelaxPredicates": 0.00769495964050293, + "Rematerialization": 0.0034401416778564453, + "RemoveOptimizationBarriers": 4.8000001697801054e-05, + "RemoveShardedPartitionAxes": 0.008293628692626953, + "ReshapeWeights": 0.004475116729736328, + "ResolveAccessConflict": 0.0053598880767822266, + "ResolveComplicatePredicates": 0.0009164810180664063, + "RewriteReplicationMatmul": 0.00577545166015625, + "RewriteWeights": 0.010277271270751953, + "SFKVectorizer": 0.2676401138305664, + "ScatterMotion": 3.199999991920777e-05, + "ShardingPropagationAnalysis": 0.06793785095214844, + "SimpleAllReduceTiling": 0.011077165603637695, + "Simplifier": 0.0029976367950439453, + "SimplifyMacroPredicates": 0.025454998016357422, + "SimplifyNeuronTensor": 0.13071107864379883, + "SimplifySlice": 0.0008246898651123047, + "SimplifyTensor": 0.03260469436645508, + "SpillPSum": 0.0713953971862793, + "SplitAPUnionSets": 0.08632850646972656, + "SplitAccGrp": 0.002518892288208008, + "StaticProfiler": 0.026699542999267578, + "StaticTransposeLocalTensor": 0.009710550308227539, + "SundaISel": 0.08615612983703613, + "TCTransform": 0.0014863014221191406, + "TensorInitialization": 0.017354965209960938, + "TensorOpSimplifier": 0.004897356033325195, + "TensorOpTransform": 0.026237010955810547, + "TensorizerLegalizationPass": 4.099999932805076e-05, + "TileCCOps": 0.007733821868896484, + "TilingProfiler": 0.03455352783203125, + "TransformConvOp": 0.0042724609375, + "TritiumFusion": 0.11825895309448242, + "ValueNumbering": 0.0019876956939697266, + "VectorizeDMA": 0.03213214874267578, + "VectorizeMatMult": 0.010382413864135742, + "VerifySupportedOps": 3.300000025774352e-05, + "WeightCoalescing": 0.010597944259643555, + "ZeroSizeTensorElimination": 0.00017881393432617188, + "algsimp": 0.0017300000181421638, + "batchnorm_expander": 3.5000000934815034e-05, + "boundary-marker-removal": 1.2000000424450263e-05, + "call-inliner": 0.00022000000171829015, + "canonicalize-boundary-marker": 2.2000000171829015e-05, + "collective-stream-id-checker": 6.299999949987978e-05, + "comparison-expander": 0.0005039999959990382, + "computation-deduplicator": 5.8999998145736754e-05, + "config-lowering": 9.800000407267362e-05, + "constant-statistics": 0.0004199999966658652, + "constant_folding": 0.00015699998766649514, + "cse": 3.699999797390774e-05, + "dce": 4.099999932805076e-05, + "dot_decomposer": 0.0009689999860711396, + "dynamic-slice-transpose": 1.4999999621068127e-05, + "eliminate-redundant-compare": 0.00013899999612476677, + "emit-offloaded-dropout": 3.900000228895806e-05, + "flatten-call-graph": 0.0006180000491440296, + "fuse-send-recv": 5.7999997807201e-05, + "hilo-conditional-to-select": 1.2999999853491317e-05, + "hilo::LegalizeAlias": 1.1000000085914508e-05, + "hilo::NeuronInstCombine": 0.0001770000089891255, + "hilo::NeuronOpFusion": 3.7999998312443495e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 4.199999966658652e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 6.199999916134402e-05, + "hilo::VerifyAliasing": 4.999999873689376e-06, + "hlo-mac-count": 0.011359000578522682, + "instruction-histogram": 0.0004990000161342323, + "io-con-pipe-begin": 3.999999989900971e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0007779999868944287, + "io-statistics": 4.099999932805076e-05, + "legalize-ccops-for-tensorizer": 3.999999989900971e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.999999747378752e-06, + "map-inline": 0.0007570000016130507, + "metadata-naming": 4.8000001697801054e-05, + "mlir::detail::OpToOpPassAdaptor": 6.500000017695129e-05, + "mlir::hlo::MhloToPyPenguin": 0.006823000032454729, + "mlir::mhlo::LowerComplexExtraPass": 0.00024300000222865492, + "mlir::mhlo::LowerComplexPass": 0.0003090000245720148, + "native-to-custom-softmax": 0.00030399998649954796, + "native-to-custom-softmax-dx": 0.0016090000281110406, + "neuron-hlo-verifier": 0.010127999819815159, + "operand_upcaster": 4.199999966658652e-05, + "opt-barrier-removal": 0.00026199998683296144, + "post-par-pipe-begin": 0.00030399998649954796, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0014479999663308263, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.05613299831748009, + "replace-minimum-constant": 0.00029700002050958574, + "reshape-mover": 5.60000044060871e-05, + "simplify-concat": 0.00010799999290611595, + "simplify-while-loops": 5.0000002374872565e-05, + "transform-variadic-reduce": 6.299999949987978e-05, + "tuple-simplifier": 0.00014699998428113759, + "unpack-nested-aws-ntwsr": 0.00021999998716637492, + "unroll-while-loop": 7.000000096013537e-06, + "zero_sized_hlo_elimination": 0.0007450000266544521 + }, + "hilo": { + "ConstantSize": 467583.0, + "HloInputCount": 371.0, + "HloMacCount": 13175750656.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910914048.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 871990400.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0, + "StaticProfiler::AifUb": 147.03309631347656, + "StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844, + "StaticProfiler::AverageDmaLength": 2425.82958984375, + "StaticProfiler::DDRTransferBytes": 365941792.0, + "StaticProfiler::InternalTransferBytes": 325506848.0, + "StaticProfiler::LoadExpanded": 84060.0, + "StaticProfiler::StoreExpanded": 1898.0, + "StaticProfiler::TotalDMAExpanded": 85958.0, + "StaticProfiler::TotalDynamicInstancesCount": 25383.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 10464.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10195.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 92.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0015739999944344163, + "call-inliner": 0.00019500000053085387, + "collective-stream-id-checker": 5.400000009103678e-05, + "comparison-expander": 0.0004710000066552311, + "constant-statistics": 0.0004199999966658652, + "constant_folding": 0.0001320000010309741, + "dce": 3.7999998312443495e-05, + "dot_decomposer": 0.0009689999860711396, + "eliminate-redundant-compare": 0.00011899999663000926, + "flatten-call-graph": 0.0005910000181756914, + "hlo-mac-count": 0.006432000081986189, + "instruction-histogram": 0.0004990000161342323, + "io-con-pipe-begin": 3.999999989900971e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0007779999868944287, + "io-statistics": 4.099999932805076e-05, + "map-inline": 0.0007220000261440873, + "native-to-custom-softmax": 0.00028899998869746923, + "native-to-custom-softmax-dx": 0.00046099998871795833, + "neuron-hlo-verifier": 0.0090549997985363, + "opt-barrier-removal": 0.00026199998683296144, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.05613299831748009, + "replace-minimum-constant": 0.0002770000137388706, + "reshape-mover": 4.70000013592653e-05, + "simplify-while-loops": 4.3000000005122274e-05, + "tuple-simplifier": 0.00013299999409355223, + "unpack-nested-aws-ntwsr": 0.00020799999765586108, + "unroll-while-loop": 7.000000096013537e-06, + "zero_sized_hlo_elimination": 0.0007450000266544521 + } + }, + "attention_isa_kernel": { + "compiletime": { + "CoalesceCCOp": 0.00019693374633789063, + "DMALocalityOpt": 0.00016736984252929688, + "DMAProfiler": 0.00026297569274902344, + "DataStreaming": 0.0002357959747314453, + "DoNothing": 0.004472255706787109, + "ExpandISAMacro": 0.00024008750915527344, + "FactorizeBlkDims": 0.001956939697265625, + "InferPSumTensor": 0.0005483627319335938, + "InferSharedMemLoc": 0.0012214183807373047, + "InsertCoreBarrier": 0.000339508056640625, + "LateLegalizeInst": 0.00020360946655273438, + "LateNeuronInstComb": 0.002096414566040039, + "LegalizeSundaAccess": 0.00022792816162109375, + "LegalizeType": 0.00030231475830078125, + "LowerBroadcast": 0.0002613067626953125, + "LowerIntrinsics": 0.0003268718719482422, + "LowerTranspose": 0.0002701282501220703, + "NeuronInstComb": 0.000457763671875, + "NeuronLICM": 0.0002644062042236328, + "NeuronSimplifyPredicates": 0.0002472400665283203, + "NeuronValueNumbering": 0.00029158592224121094, + "SFKVectorizer": 0.002269744873046875, + "SimpleAllReduceTiling": 0.00020956993103027344, + "SimplifyNeuronTensor": 0.0006353855133056641, + "SpillPSum": 0.0006325244903564453, + "WeightCoalescing": 0.00021409988403320313 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00030303001403808594, + "DMALocalityOpt": 0.00025963783264160156, + "DMAProfiler": 0.0011391639709472656, + "DataStreaming": 0.0004107952117919922, + "DoNothing": 0.00016951560974121094, + "ExpandISAMacro": 0.0008628368377685547, + "FactorizeBlkDims": 0.0031676292419433594, + "InferPSumTensor": 0.0011391639709472656, + "InferSharedMemLoc": 0.0004911422729492188, + "InsertCoreBarrier": 0.0014476776123046875, + "LateLegalizeInst": 0.0051555633544921875, + "LateNeuronInstComb": 0.0011050701141357422, + "LegalizeSundaAccess": 0.0025599002838134766, + "LegalizeType": 0.0004215240478515625, + "LowerBroadcast": 0.0014843940734863281, + "LowerIntrinsics": 0.0016138553619384766, + "LowerTranspose": 0.00037097930908203125, + "NeuronInstComb": 0.0021207332611083984, + "NeuronLICM": 0.0007026195526123047, + "NeuronSimplifyPredicates": 0.004625082015991211, + "NeuronValueNumbering": 0.0007369518280029297, + "SFKVectorizer": 0.005678415298461914, + "SimpleAllReduceTiling": 0.0004096031188964844, + "SimplifyNeuronTensor": 0.0030858516693115234, + "SpillPSum": 0.0021026134490966797, + "WeightCoalescing": 0.0003502368927001953 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 2.499999936844688e-05, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.00033400001120753586, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.4999999621068127e-05, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 1.4999999621068127e-05, + "PruneFunctions": 1.4000000192027073e-05, + "RemoveOptimizationBarriers": 2.099999983329326e-05, + "ScatterMotion": 2.9999999242136255e-05, + "TensorizerLegalizationPass": 1.8999999156221747e-05, + "VerifySupportedOps": 1.1000000085914508e-05, + "algsimp": 5.8000001445179805e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 1.2000000424450263e-05, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8000000636675395e-05, + "config-lowering": 2.9999999242136255e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.4000000192027073e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 4.999999873689376e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 2.099999983329326e-05, + "hilo-conditional-to-select": 3.000000106112566e-06, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 5.700000110664405e-05, + "hilo::NeuronOpFusion": 4.999999873689376e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.1000000085914508e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 9.999999747378752e-06, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 7.899999764049426e-05, + "legalize-ccops-for-tensorizer": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.4999999621068127e-05, + "mlir::detail::OpToOpPassAdaptor": 2.2000000171829015e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009059999720193446, + "mlir::mhlo::LowerComplexExtraPass": 9.600000339560211e-05, + "mlir::mhlo::LowerComplexPass": 0.00018000000272877514, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 0.0011220000451430678, + "neuron-hlo-verifier": 0.00035700001171790063, + "operand_upcaster": 1.8000000636675395e-05, + "post-par-pipe-begin": 0.0003020000003743917, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005360000068321824, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.400000059627928e-05, + "simplify-while-loops": 3.000000106112566e-06, + "transform-variadic-reduce": 7.999999979801942e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 8.479304313659668, + "ConstantSize": 467583.0, + "HloInputCount": 371.0, + "HloMacCount": 1677721600.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910914048.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 395721504.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.05208444595336914, + "AffinePredicateResolution": 0.002167940139770508, + "AliasDependencyElimination": 0.00020503997802734375, + "AliasDependencyInduction": 0.006783246994018555, + "AliasDependencyReset": 0.20125508308410645, + "BFComputeCutting": 0.007937908172607422, + "BirCodeGenLoop": 0.10184049606323242, + "CCOpFusion": 0.03359842300415039, + "CanonicalizeDAGForPGTiling": 0.003628253936767578, + "CanonicalizeIR": 0.0030901432037353516, + "CoalesceCCOp": 0.017004013061523438, + "CommuteConcat": 0.0019147396087646484, + "DMALocalityOpt": 0.008169889450073242, + "DMAProfiler": 0.019730091094970703, + "DMATilingProfiler": 0.01212453842163086, + "DataLocalityOpt": 0.20879435539245605, + "DataStreaming": 0.017726421356201172, + "DeConcat": 0.0039784908294677734, + "DeadCodeElimination": 0.0020265579223632813, + "DeadStoreElimination": 0.023813247680664063, + "DelinearIndices": 0.020769357681274414, + "Delinearization": 0.03343391418457031, + "DelinearizeSPMD": 0.0467836856842041, + "DoNothing": 8.96453857421875e-05, + "DramToDramTranspose": 0.029311418533325195, + "DumpGraphAndMetadata": 0.008599281311035156, + "EliminateDivs": 0.003629446029663086, + "ExpandBatchNorm": 0.0015780925750732422, + "ExpandISAMacro": 0.006983280181884766, + "FactorizeBlkDims": 0.02126312255859375, + "FactorizeThreadAxesInFreeDims": 0.003243684768676758, + "FlattenMacroLoop": 0.0065686702728271484, + "GenericAccessSimplifier": 0.001466512680053711, + "InferInitValue": 0.04482269287109375, + "InferIntrinsicOnCC": 0.01812601089477539, + "InferNeuronTensor": 0.10232234001159668, + "InferNonlocalTensors": 0.17829585075378418, + "InferPSumTensor": 0.08844804763793945, + "InferShardAxis": 0.7131092548370361, + "InferSharedMemLoc": 0.007193565368652344, + "InlineNativeKernels": 0.006009101867675781, + "InsertCoreBarrier": 0.015059709548950195, + "InsertIOTransposes": 0.07647299766540527, + "InsertImplicitShardAxisBeforeISel": 0.020087480545043945, + "InsertLocalTransposes": 0.037857770919799805, + "InsertOffloadedTransposes": 0.022881269454956055, + "LICM": 0.012552261352539063, + "LateLegalizeInst": 0.025588512420654297, + "LateLegalizePostSplit": 0.012372970581054688, + "LateLowerReshapeOp": 0.004400491714477539, + "LateLowerTensorOp": 0.004253387451171875, + "LateNeuronInstComb": 0.039977073669433594, + "LayoutPreprocessing": 0.06799173355102539, + "LayoutPreprocessingAndAnalysis": 0.1176137924194336, + "LayoutRequirementAnalysis": 0.01578998565673828, + "LegalizeCCOpLayout": 0.0030679702758789063, + "LegalizeOpLevelAlias": 0.0017116069793701172, + "LegalizePartitionReduce": 0.002843618392944336, + "LegalizeSundaAccess": 0.08243513107299805, + "LegalizeSundaMacro": 0.02523207664489746, + "LegalizeType": 0.014882326126098633, + "LocalLayoutOpt": 0.019226789474487305, + "LoopFusion": 0.007382631301879883, + "LoopSplitting": 0.0006470680236816406, + "LowerBroadcast": 0.005588054656982422, + "LowerCCOpBlockAxis": 0.0077972412109375, + "LowerComplexBroadcast": 0.005771636962890625, + "LowerIntrinsics": 0.06823062896728516, + "LowerShardAxis": 0.01669931411743164, + "LowerTensorOp": 0.028963327407836914, + "LowerToSendRecv": 0.003696441650390625, + "LowerTranspose": 0.022225618362426758, + "MacroGeneration": 0.0702672004699707, + "MaskPropagation": 0.010986804962158203, + "MemcpyElimination": 0.1031653881072998, + "MutateDataType": 0.0030710697174072266, + "NeuronAliasDependencyInduction": 0.0008504390716552734, + "NeuronAliasDependencyReset": 0.10823488235473633, + "NeuronInstComb": 0.032953739166259766, + "NeuronLICM": 0.018877506256103516, + "NeuronLoopFusion": 0.03511810302734375, + "NeuronLoopInterchange": 0.009130239486694336, + "NeuronSimplifier": 0.02072596549987793, + "NeuronSimplifyPredicates": 0.005728721618652344, + "NeuronValueNumbering": 0.017284870147705078, + "OptimizeAliasedCopyChain": 0.0006775856018066406, + "OptimizeNKIKernels": 0.5134098529815674, + "PAGLayoutOpt": 0.5583286285400391, + "PComputeCutting": 0.026990413665771484, + "PGLayoutTilingPipeline": 2.505728006362915, + "PGTiling": 0.4031352996826172, + "PadElimination": 0.0005686283111572266, + "ParAxesAnnotation": 0.48941731452941895, + "PartialLoopFusion": 0.03877878189086914, + "PartialSimdFusion": 0.05450034141540527, + "PerfectLoopNest": 0.006276607513427734, + "RecognizeOpIdiom": 0.006324291229248047, + "Recompute": 0.0004134178161621094, + "RelaxPredicates": 0.008553743362426758, + "Rematerialization": 0.012713193893432617, + "RemoveShardedPartitionAxes": 0.04062914848327637, + "ReshapeWeights": 0.0019867420196533203, + "ResolveAccessConflict": 0.006893634796142578, + "ResolveComplicatePredicates": 0.0020072460174560547, + "RewriteReplicationMatmul": 0.002567291259765625, + "RewriteWeights": 0.008040666580200195, + "SFKVectorizer": 0.35219240188598633, + "ShardingPropagationAnalysis": 0.03732752799987793, + "SimpleAllReduceTiling": 0.00998234748840332, + "Simplifier": 0.00720524787902832, + "SimplifyMacroPredicates": 0.008156061172485352, + "SimplifyNeuronTensor": 0.020155906677246094, + "SimplifySlice": 0.0016894340515136719, + "SimplifyTensor": 0.01220250129699707, + "SpillPSum": 0.03788638114929199, + "SplitAPUnionSets": 0.05510139465332031, + "SplitAccGrp": 0.006468534469604492, + "StaticProfiler": 0.017852783203125, + "StaticTransposeLocalTensor": 0.00736546516418457, + "SundaISel": 0.09026622772216797, + "TCTransform": 0.0017704963684082031, + "TensorInitialization": 0.010450363159179688, + "TensorOpSimplifier": 0.02020740509033203, + "TensorOpTransform": 0.027513504028320313, + "TileCCOps": 0.008568286895751953, + "TilingProfiler": 0.03838157653808594, + "TransformConvOp": 0.007506370544433594, + "TritiumFusion": 0.050549983978271484, + "ValueNumbering": 0.0038373470306396484, + "VectorizeDMA": 0.017205238342285156, + "VectorizeMatMult": 0.021669626235961914, + "WeightCoalescing": 0.004259347915649414, + "ZeroSizeTensorElimination": 0.00019121170043945313 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 847.0, + "StaticProfiler::AifUb": 8.478300094604492, + "StaticProfiler::ArithmeticIntensityTensorizer": 131.77493286132813, + "StaticProfiler::AverageDmaLength": 1355.7093505859375, + "StaticProfiler::AverageFractalPeUtilization": 99.68699645996094, + "StaticProfiler::AveragePartitionUtilization": 99.0614013671875, + "StaticProfiler::AveragePeUtilization": 99.3685073852539, + "StaticProfiler::DDRTransferBytes": 29617926.0, + "StaticProfiler::InternalTransferBytes": 11470848.0, + "StaticProfiler::LoadExpanded": 12422.0, + "StaticProfiler::LocalizationEfficiency": 1554.2613525390625, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1973.466064453125, + "StaticProfiler::StoreExpanded": 5889.0, + "StaticProfiler::TotalDMAExpanded": 18311.0, + "StaticProfiler::TotalDynamicInstancesCount": 1115.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1113.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 20.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 514.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 2.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 104.0, + "TilingProfiler::PfTransposeInstructionsForIo": 32.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 24.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 48.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 86.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.10172820091247559, + "AffinePredicateResolution": 0.0019948482513427734, + "AliasDependencyElimination": 0.0002758502960205078, + "AliasDependencyInduction": 0.007086038589477539, + "AliasDependencyReset": 0.13438987731933594, + "BFComputeCutting": 0.0027761459350585938, + "BirCodeGenLoop": 0.05368757247924805, + "CCOpFusion": 0.03205680847167969, + "CanonicalizeDAGForPGTiling": 0.0052297115325927734, + "CanonicalizeIR": 0.002682924270629883, + "CoalesceCCOp": 0.008353471755981445, + "CommuteConcat": 0.0031654834747314453, + "DMALocalityOpt": 0.0032248497009277344, + "DMAProfiler": 0.006761789321899414, + "DMATilingProfiler": 0.00853872299194336, + "DataLocalityOpt": 0.366649866104126, + "DataStreaming": 0.008889198303222656, + "DeConcat": 0.002901792526245117, + "DeadCodeElimination": 0.016579151153564453, + "DeadStoreElimination": 0.029788732528686523, + "DelinearIndices": 0.019867897033691406, + "Delinearization": 0.0065822601318359375, + "DelinearizeSPMD": 0.023911237716674805, + "DoNothing": 7.867813110351563e-05, + "DramToDramTranspose": 0.026773452758789063, + "DumpGraphAndMetadata": 0.006331682205200195, + "EliminateDivs": 0.006492137908935547, + "ExpandBatchNorm": 0.0019371509552001953, + "ExpandISAMacro": 0.011901378631591797, + "FactorizeBlkDims": 0.03787398338317871, + "FactorizeThreadAxesInFreeDims": 0.0023696422576904297, + "FlattenMacroLoop": 0.006732463836669922, + "GenericAccessSimplifier": 0.0011754035949707031, + "InferInitValue": 0.07735943794250488, + "InferIntrinsicOnCC": 0.017465829849243164, + "InferNeuronTensor": 0.09335732460021973, + "InferNonlocalTensors": 0.029421567916870117, + "InferPSumTensor": 0.12906312942504883, + "InferShardAxis": 0.7434248924255371, + "InferSharedMemLoc": 0.005700111389160156, + "InlineNativeKernels": 0.002834320068359375, + "InsertCoreBarrier": 0.006781339645385742, + "InsertIOTransposes": 0.0841522216796875, + "InsertImplicitShardAxisBeforeISel": 0.012434244155883789, + "InsertLocalTransposes": 0.019251346588134766, + "InsertOffloadedTransposes": 0.028300762176513672, + "LICM": 0.005795001983642578, + "LateLegalizeInst": 0.011514902114868164, + "LateLegalizePostSplit": 0.005158185958862305, + "LateLowerReshapeOp": 0.0047490596771240234, + "LateLowerTensorOp": 0.004218101501464844, + "LateNeuronInstComb": 0.047844648361206055, + "LayoutPreprocessing": 0.03463029861450195, + "LayoutPreprocessingAndAnalysis": 0.06621217727661133, + "LayoutRequirementAnalysis": 0.007728099822998047, + "LegalizeCCOpLayout": 0.003231048583984375, + "LegalizeOpLevelAlias": 0.001981973648071289, + "LegalizePartitionReduce": 0.0027234554290771484, + "LegalizeSundaAccess": 0.04511404037475586, + "LegalizeSundaMacro": 0.022600412368774414, + "LegalizeType": 0.0190885066986084, + "LocalLayoutOpt": 0.04217672348022461, + "LoopFusion": 0.012153148651123047, + "LoopSplitting": 0.0006983280181884766, + "LowerBroadcast": 0.001943826675415039, + "LowerCCOpBlockAxis": 0.007781505584716797, + "LowerComplexBroadcast": 0.004039287567138672, + "LowerIntrinsics": 0.08824563026428223, + "LowerShardAxis": 0.008327722549438477, + "LowerTensorOp": 0.033898115158081055, + "LowerToSendRecv": 0.005768775939941406, + "LowerTranspose": 0.02297377586364746, + "MacroGeneration": 0.16904258728027344, + "MaskPropagation": 0.007157087326049805, + "MemcpyElimination": 0.08653593063354492, + "MutateDataType": 0.001874685287475586, + "NeuronAliasDependencyInduction": 0.0008199214935302734, + "NeuronAliasDependencyReset": 0.09268832206726074, + "NeuronInstComb": 0.013442754745483398, + "NeuronLICM": 0.04093337059020996, + "NeuronLoopFusion": 0.07855010032653809, + "NeuronLoopInterchange": 0.0029878616333007813, + "NeuronSimplifier": 0.013553857803344727, + "NeuronSimplifyPredicates": 0.0043621063232421875, + "NeuronValueNumbering": 0.011638164520263672, + "OptimizeAliasedCopyChain": 0.001085042953491211, + "OptimizeNKIKernels": 0.4002358913421631, + "PAGLayoutOpt": 0.5899946689605713, + "PComputeCutting": 0.011747598648071289, + "PGLayoutTilingPipeline": 2.3099381923675537, + "PGTiling": 0.39591336250305176, + "PadElimination": 0.0018284320831298828, + "ParAxesAnnotation": 0.5343668460845947, + "PartialLoopFusion": 0.0648810863494873, + "PartialSimdFusion": 0.06934404373168945, + "PerfectLoopNest": 0.010063648223876953, + "RecognizeOpIdiom": 0.006760358810424805, + "Recompute": 0.0004215240478515625, + "RelaxPredicates": 0.004682064056396484, + "Rematerialization": 0.0020973682403564453, + "RemoveShardedPartitionAxes": 0.03322100639343262, + "ReshapeWeights": 0.005750894546508789, + "ResolveAccessConflict": 0.005618572235107422, + "ResolveComplicatePredicates": 0.0011665821075439453, + "RewriteReplicationMatmul": 0.0025589466094970703, + "RewriteWeights": 0.010002374649047852, + "SFKVectorizer": 0.2708115577697754, + "ShardingPropagationAnalysis": 0.04528522491455078, + "SimpleAllReduceTiling": 0.003036975860595703, + "Simplifier": 0.004547834396362305, + "SimplifyMacroPredicates": 0.0300595760345459, + "SimplifyNeuronTensor": 0.014966249465942383, + "SimplifySlice": 0.01027679443359375, + "SimplifyTensor": 0.020308732986450195, + "SpillPSum": 0.04539823532104492, + "SplitAPUnionSets": 0.023496150970458984, + "SplitAccGrp": 0.0026144981384277344, + "StaticProfiler": 0.006074190139770508, + "StaticTransposeLocalTensor": 0.006592273712158203, + "SundaISel": 0.06954693794250488, + "TCTransform": 0.001828908920288086, + "TensorInitialization": 0.00876927375793457, + "TensorOpSimplifier": 0.011527299880981445, + "TensorOpTransform": 0.03972220420837402, + "TileCCOps": 0.00546574592590332, + "TilingProfiler": 0.02742171287536621, + "TransformConvOp": 0.006824016571044922, + "TritiumFusion": 0.11011958122253418, + "ValueNumbering": 0.004981040954589844, + "VectorizeDMA": 0.03582024574279785, + "VectorizeMatMult": 0.0291445255279541, + "WeightCoalescing": 0.008509397506713867, + "ZeroSizeTensorElimination": 0.00014853477478027344 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 1813.0, + "StaticProfiler::AifUb": 76.42292022705078, + "StaticProfiler::ArithmeticIntensityTensorizer": 227.36143493652344, + "StaticProfiler::AverageDmaLength": 4034.3251953125, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.65364074707031, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 63514120.0, + "StaticProfiler::InternalTransferBytes": 13500416.0, + "StaticProfiler::LoadExpanded": 10497.0, + "StaticProfiler::LocalizationEfficiency": 297.5042419433594, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 352.84381103515625, + "StaticProfiler::StoreExpanded": 2561.0, + "StaticProfiler::TotalDMAExpanded": 13058.0, + "StaticProfiler::TotalDynamicInstancesCount": 2025.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2025.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 16.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 1280.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 2.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 116.0, + "TilingProfiler::PfTransposeInstructionsForIo": 36.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 16.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 64.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 113.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.08984947204589844, + "AffinePredicateResolution": 0.0009312629699707031, + "AliasDependencyElimination": 0.00024366378784179688, + "AliasDependencyInduction": 0.005263328552246094, + "AliasDependencyReset": 0.04176759719848633, + "BFComputeCutting": 0.002216339111328125, + "BirCodeGenLoop": 0.3660314083099365, + "CCOpFusion": 0.04759931564331055, + "CanonicalizeDAGForPGTiling": 0.006819009780883789, + "CanonicalizeIR": 0.0015099048614501953, + "CoalesceCCOp": 0.007388591766357422, + "CommuteConcat": 0.0021598339080810547, + "DMALocalityOpt": 0.002432584762573242, + "DMAProfiler": 0.022784948348999023, + "DMATilingProfiler": 0.007287263870239258, + "DataLocalityOpt": 0.15184760093688965, + "DataStreaming": 0.007554292678833008, + "DeConcat": 0.0052378177642822266, + "DeadCodeElimination": 0.0020182132720947266, + "DeadStoreElimination": 0.007268428802490234, + "DelinearIndices": 0.006491422653198242, + "Delinearization": 0.00418853759765625, + "DelinearizeSPMD": 0.03150320053100586, + "DoNothing": 8.726119995117188e-05, + "DramToDramTranspose": 0.028717756271362305, + "DumpGraphAndMetadata": 0.04632568359375, + "EliminateDivs": 0.0021729469299316406, + "ExpandBatchNorm": 0.0017549991607666016, + "ExpandISAMacro": 0.0053784847259521484, + "FactorizeBlkDims": 0.046364784240722656, + "FactorizeThreadAxesInFreeDims": 0.0036237239837646484, + "FlattenMacroLoop": 0.012475728988647461, + "GenericAccessSimplifier": 0.0007128715515136719, + "InferInitValue": 0.11746096611022949, + "InferIntrinsicOnCC": 0.008626222610473633, + "InferNeuronTensor": 0.17520785331726074, + "InferNonlocalTensors": 0.02865004539489746, + "InferPSumTensor": 0.07464981079101563, + "InferShardAxis": 0.2832298278808594, + "InferSharedMemLoc": 0.01778268814086914, + "InlineNativeKernels": 0.0025413036346435547, + "InsertCoreBarrier": 0.007167816162109375, + "InsertIOTransposes": 0.058136701583862305, + "InsertImplicitShardAxisBeforeISel": 0.024377822875976563, + "InsertLocalTransposes": 0.016265153884887695, + "InsertOffloadedTransposes": 0.03376030921936035, + "LICM": 0.015621185302734375, + "LateLegalizeInst": 0.018033266067504883, + "LateLegalizePostSplit": 0.01734447479248047, + "LateLowerReshapeOp": 0.0016047954559326172, + "LateLowerTensorOp": 0.0011878013610839844, + "LateNeuronInstComb": 0.05313730239868164, + "LayoutPreprocessing": 0.05620622634887695, + "LayoutPreprocessingAndAnalysis": 0.18100428581237793, + "LayoutRequirementAnalysis": 0.014584064483642578, + "LegalizeCCOpLayout": 0.0032541751861572266, + "LegalizeOpLevelAlias": 0.0010030269622802734, + "LegalizePartitionReduce": 0.002452373504638672, + "LegalizeSundaAccess": 0.040776968002319336, + "LegalizeSundaMacro": 0.0427708625793457, + "LegalizeType": 0.016519784927368164, + "LocalLayoutOpt": 0.014898538589477539, + "LoopFusion": 0.005176067352294922, + "LoopSplitting": 0.00048732757568359375, + "LowerBroadcast": 0.004655599594116211, + "LowerCCOpBlockAxis": 0.004888296127319336, + "LowerComplexBroadcast": 0.010831594467163086, + "LowerIntrinsics": 0.03900289535522461, + "LowerShardAxis": 0.017355918884277344, + "LowerTensorOp": 0.013428449630737305, + "LowerToSendRecv": 0.038613319396972656, + "LowerTranspose": 0.050206661224365234, + "MacroGeneration": 0.1058506965637207, + "MaskPropagation": 0.004538536071777344, + "MemcpyElimination": 0.04629826545715332, + "MutateDataType": 0.0012559890747070313, + "NeuronAliasDependencyInduction": 0.0006165504455566406, + "NeuronAliasDependencyReset": 0.03877615928649902, + "NeuronInstComb": 0.02690267562866211, + "NeuronLICM": 0.024822473526000977, + "NeuronLoopFusion": 0.08438324928283691, + "NeuronLoopInterchange": 0.0028100013732910156, + "NeuronSimplifier": 0.0370326042175293, + "NeuronSimplifyPredicates": 0.017668962478637695, + "NeuronValueNumbering": 0.006052970886230469, + "OptimizeAliasedCopyChain": 0.0005040168762207031, + "OptimizeNKIKernels": 4.637849807739258, + "PAGLayoutOpt": 0.15427088737487793, + "PComputeCutting": 0.022019147872924805, + "PGLayoutTilingPipeline": 1.5585658550262451, + "PGTiling": 0.3059046268463135, + "PadElimination": 0.00058746337890625, + "ParAxesAnnotation": 0.07737350463867188, + "PartialLoopFusion": 0.03046131134033203, + "PartialSimdFusion": 0.008630514144897461, + "PerfectLoopNest": 0.0037374496459960938, + "RecognizeOpIdiom": 0.0049936771392822266, + "Recompute": 0.0004494190216064453, + "RelaxPredicates": 0.00769495964050293, + "Rematerialization": 0.0034401416778564453, + "RemoveShardedPartitionAxes": 0.008293628692626953, + "ReshapeWeights": 0.004475116729736328, + "ResolveAccessConflict": 0.0053598880767822266, + "ResolveComplicatePredicates": 0.0009164810180664063, + "RewriteReplicationMatmul": 0.00577545166015625, + "RewriteWeights": 0.010277271270751953, + "SFKVectorizer": 0.19967889785766602, + "ShardingPropagationAnalysis": 0.06793785095214844, + "SimpleAllReduceTiling": 0.004133701324462891, + "Simplifier": 0.0029976367950439453, + "SimplifyMacroPredicates": 0.025454998016357422, + "SimplifyNeuronTensor": 0.029609203338623047, + "SimplifySlice": 0.0008246898651123047, + "SimplifyTensor": 0.03260469436645508, + "SpillPSum": 0.01929450035095215, + "SplitAPUnionSets": 0.08632850646972656, + "SplitAccGrp": 0.002518892288208008, + "StaticProfiler": 0.026699542999267578, + "StaticTransposeLocalTensor": 0.009710550308227539, + "SundaISel": 0.08615612983703613, + "TCTransform": 0.0014863014221191406, + "TensorInitialization": 0.017354965209960938, + "TensorOpSimplifier": 0.004897356033325195, + "TensorOpTransform": 0.026237010955810547, + "TileCCOps": 0.007733821868896484, + "TilingProfiler": 0.03455352783203125, + "TransformConvOp": 0.0042724609375, + "TritiumFusion": 0.11825895309448242, + "ValueNumbering": 0.0019876956939697266, + "VectorizeDMA": 0.03213214874267578, + "VectorizeMatMult": 0.010382413864135742, + "WeightCoalescing": 0.003669261932373047, + "ZeroSizeTensorElimination": 0.00017881393432617188 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 20919.0, + "StaticProfiler::AifUb": 147.03309631347656, + "StaticProfiler::ArithmeticIntensityTensorizer": 141.05162048339844, + "StaticProfiler::AverageDmaLength": 2425.82958984375, + "StaticProfiler::AverageFractalPeUtilization": 98.71436309814453, + "StaticProfiler::AveragePartitionUtilization": 94.08551025390625, + "StaticProfiler::AveragePeUtilization": 96.60899353027344, + "StaticProfiler::DDRTransferBytes": 365941792.0, + "StaticProfiler::InternalTransferBytes": 325506848.0, + "StaticProfiler::LoadExpanded": 84060.0, + "StaticProfiler::LocalizationEfficiency": 95.931884765625, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 99.52960968017578, + "StaticProfiler::StoreExpanded": 1898.0, + "StaticProfiler::TotalDMAExpanded": 85958.0, + "StaticProfiler::TotalDynamicInstancesCount": 25383.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 24932.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 10464.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10195.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 690.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 92.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 2.499999936844688e-05, + "CanonicalizeForTensorizer": 1.1000000085914508e-05, + "Canonicalizer": 0.00020599999697878957, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.2000000424450263e-05, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 9.999999747378752e-06, + "PruneFunctions": 1.4999999621068127e-05, + "RemoveOptimizationBarriers": 1.9999999494757503e-05, + "ScatterMotion": 1.9999999949504854e-06, + "TensorizerLegalizationPass": 1.4000000192027073e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 4.999999873689376e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 3.999999989900971e-06, + "computation-deduplicator": 1.8000000636675395e-05, + "config-lowering": 3.400000059627928e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 9.999999747378752e-06, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 4.999999873689376e-06, + "eliminate-redundant-compare": 1.2999999853491317e-05, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 7.000000096013537e-06, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 3.999999989900971e-06, + "hilo::NeuronInstCombine": 5.6000000768108293e-05, + "hilo::NeuronOpFusion": 2.300000051036477e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 7.999999979801942e-06, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 8.900000102585182e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 1.8999999156221747e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009159999899566174, + "mlir::mhlo::LowerComplexExtraPass": 6.900000153109431e-05, + "mlir::mhlo::LowerComplexPass": 0.00011800000356743112, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.2999999853491317e-05, + "neuron-hlo-verifier": 0.00035600000410340726, + "operand_upcaster": 1.2000000424450263e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0004619999963324517, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.199999966658652e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 105.0946273803711, + "HloMacCount": 6509559808.0, + "Traffic": 123879968.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.2999999853491317e-05, + "Canonicalizer": 0.0002699999895412475, + "HoistCompute": 4.999999873689376e-06, + "IdentifyCrossPassTensors": 1.2000000424450263e-05, + "MemcastMotion": 3.000000106112566e-06, + "PenguinizeFunctions": 1.2000000424450263e-05, + "PruneFunctions": 1.700000029813964e-05, + "RemoveOptimizationBarriers": 7.000000096013537e-06, + "ScatterMotion": 0.0, + "TensorizerLegalizationPass": 7.999999979801942e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.000000318337698e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 2.4000000848900527e-05, + "computation-deduplicator": 2.300000051036477e-05, + "config-lowering": 3.400000059627928e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 1.2999999853491317e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 4.999999873689376e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2000000424450263e-05, + "flatten-call-graph": 1.1000000085914508e-05, + "fuse-send-recv": 1.8999999156221747e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 6.399999983841553e-05, + "hilo::NeuronOpFusion": 9.999999747378752e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 4.400000034365803e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.004759000148624182, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.2999999853491317e-05, + "metadata-naming": 1.5999999959603883e-05, + "mlir::detail::OpToOpPassAdaptor": 2.4000000848900527e-05, + "mlir::hlo::MhloToPyPenguin": 0.005001000128686428, + "mlir::mhlo::LowerComplexExtraPass": 7.79999973019585e-05, + "mlir::mhlo::LowerComplexPass": 1.1000000085914508e-05, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.2999999853491317e-05, + "neuron-hlo-verifier": 0.0003600000054575503, + "operand_upcaster": 1.2000000424450263e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00044999999227002263, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.199999991920777e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.8000001697801054e-05, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 4.999999873689376e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 28.312292098999023, + "HloMacCount": 4988469248.0, + "Traffic": 352388928.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.006628990173339844, + "DMALocalityOpt": 0.003807544708251953, + "DMAProfiler": 0.007816553115844727, + "DataStreaming": 0.022742509841918945, + "DoNothing": 0.00023865699768066406, + "ExpandISAMacro": 0.0065212249755859375, + "FactorizeBlkDims": 0.026747465133666992, + "InferPSumTensor": 0.02189779281616211, + "InferSharedMemLoc": 0.0063364505767822266, + "InsertCoreBarrier": 0.006017923355102539, + "LateLegalizeInst": 0.014620304107666016, + "LateNeuronInstComb": 0.0202789306640625, + "LegalizeSundaAccess": 0.028186798095703125, + "LegalizeType": 0.019533634185791016, + "LowerBroadcast": 0.013374805450439453, + "LowerIntrinsics": 0.010933876037597656, + "LowerTranspose": 0.00744938850402832, + "NeuronInstComb": 0.02654409408569336, + "NeuronLICM": 0.021889686584472656, + "NeuronSimplifyPredicates": 0.006708860397338867, + "NeuronValueNumbering": 0.007520914077758789, + "SFKVectorizer": 0.06228280067443848, + "SimpleAllReduceTiling": 0.00653386116027832, + "SimplifyNeuronTensor": 0.09801602363586426, + "SpillPSum": 0.04999828338623047, + "WeightCoalescing": 0.0065784454345703125 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk1/graph.neff b/context_encoding_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..69e2489e54b9eb80c754175cdcb9058f306219bb --- /dev/null +++ b/context_encoding_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826 +size 1229824 diff --git a/context_encoding_model/_tp0_bk1/log-neuron-cc.txt b/context_encoding_model/_tp0_bk1/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..8397c0afffa81747eff5c3574c2015084bb25e6e --- /dev/null +++ b/context_encoding_model/_tp0_bk1/log-neuron-cc.txt @@ -0,0 +1,9566 @@ +2025-11-04T21:38:31Z INFO 8502 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:31Z INFO 8502 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:31Z INFO 8522 [root]: XLA detected +2025-11-04T21:38:31Z INFO 8522 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:31Z INFO 8522 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1 +2025-11-04T21:38:31Z INFO 8522 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:31Z INFO 8522 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:31Z INFO 8522 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:31Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:31Z INFO 8522 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:31Z INFO 8522 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:31Z INFO 8522 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:32Z INFO 8522 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8312 + reshape 1912 23.00% ################################################################ + broadcast 1123 13.51% ##################################### + transpose 1072 12.90% ################################### + convert 945 11.37% ############################### + constant 636 7.65% ##################### + parameter 371 4.46% ############ + slice 347 4.17% ########### + add 284 3.42% ######### + get-tuple-element 259 3.12% ######## + multiply 255 3.07% ######## + dot 198 2.38% ###### + call 174 2.09% ##### + compare 173 2.08% ##### + select 170 2.05% ##### + concatenate 116 1.40% ### + tuple 57 0.69% # + scatter 57 0.69% # + negate 56 0.67% # + all-reduce 56 0.67% # + divide 29 0.35% + gather 6 0.07% + iota 5 0.06% + all-gather 3 0.04% + reduce 3 0.04% + custom-call 2 0.02% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5437 + reshape 1421 26.14% ################################################################ + transpose 817 15.03% #################################### + convert 720 13.24% ################################ + constant 443 8.15% ################### + parameter 371 6.82% ################ + broadcast 266 4.89% ########### + dot 197 3.62% ######## + custom-call 175 3.22% ####### + multiply 171 3.15% ####### + add 171 3.15% ####### + get-tuple-element 147 2.70% ###### + slice 115 2.12% ##### + concatenate 114 2.10% ##### + compare 59 1.09% ## + select 58 1.07% ## + scatter 57 1.05% ## + negate 56 1.03% ## + all-reduce 56 1.03% ## + gather 6 0.11% + all-gather 3 0.06% + iota 3 0.06% + reduce 3 0.06% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 175758114816 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:32Z INFO 8522 [job.HLOToTensorizer.0]: IR signature: 6ff9c9cf82bf483a04e8e446a08497ebb81595120d9ffaf9ba37b3a8ca72a150 for sg0000/HLOToTensorizer +2025-11-04T21:38:32Z INFO 8522 [job.HLOToTensorizer.0]: IR signature: 8b71039a7fc20277fc92b979e2435112f2809401a94f563c433ff50eed25202e for sg0001/HLOToTensorizer +2025-11-04T21:38:32Z INFO 8522 [job.HLOToTensorizer.0]: IR signature: d0e626ffe73dc7d995c9055f78514514a3f2e330390cec2aa95adc4273e03f9a for sg0002/HLOToTensorizer +2025-11-04T21:38:32Z INFO 8522 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:32Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:32Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:32Z INFO 8522 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:32Z INFO 8522 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:32Z INFO 8522 [job.Frontend.0]: Start model loading +2025-11-04T21:38:32Z INFO 8522 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:32Z INFO 8522 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:32Z USER 8522 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:32Z INFO 8522 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:32Z INFO 8588 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:32Z INFO 8589 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:32Z INFO 8590 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:32Z INFO 8588 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:32Z INFO 8588 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:32Z INFO 8588 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:32Z INFO 8589 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:32Z INFO 8589 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.008 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.007 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.029 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.034 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.016 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.039 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.022 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.058 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.012 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.020 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.004 seconds +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8590 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.003 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.016 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:32Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:32Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.024 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.040 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.013 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.019 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.028 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.012 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.134 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.082 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.087 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.026 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.024 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.042 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.201 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.046 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.032 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.023 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.097 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.103 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.039 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.032 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.013 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.016 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.044 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.015 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.012 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.044 seconds +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:33Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.017 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.050 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.012 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6459 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6596 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.014 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.024 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.030 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=1048576 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 256) %'all_gather.1' = AllGatherOp-34 AllGather_add(bfloat16 (1024, 256) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 256), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 15 | , id = 34 +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.024 seconds +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.015 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:34Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8589 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.056 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.020 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.074 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.018 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.015 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.181 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.019 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.011 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.077 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.014 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.017 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.068 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.016 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.154 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.016 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.118 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.032 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.017 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.068 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.042 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.017 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.178 seconds +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:35Z INFO 8588 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.035 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.066 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:35Z INFO 8589 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 36 +total number of sharded dags: 9 + +total bytes transferred from input, output, non local tensors: 354491170 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 350272272 +% bytes transferred with 2x bandwidths: 98.81 + +NC0 FLOPs: 4992055650 +NC1 FLOPs: 4989426784 +% FLOPs sharded: 99.97 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 2, Number of dags: 3 +Matmuls sharded with this dim: +[256,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [256,8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[256,2,8,128] @ [2,8,128,2(s),6,2,128] = [256,2(s),6,2,128] Number of occurrences: 2 + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.283 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:35Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG75'), (10, 'AG77'), (20, 'AG76')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG81'), (0, 'AG78'), (17, 'AG80'), (19, 'AG79')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(22, 'AG71'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG85'), (21, 'AG84'), (16, 'AG73'), (18, 'AG72')] +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.090 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.489 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.038 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.558 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.106 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.306 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.033 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.047 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.534 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.037 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.590 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.058 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.045 seconds +2025-11-04T21:38:36Z INFO 8589 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.034 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.029 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.559 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 192: matmul_128x128x512 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 24: simd128x256 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.035 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.161 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.175 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 31 +total number of sharded dags: 24 + +total bytes transferred from input, output, non local tensors: 19538950 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 11149312 +% bytes transferred with 2x bandwidths: 57.06 + +NC0 FLOPs: 1613881347 +NC1 FLOPs: 1613881344 +% FLOPs sharded: 99.99 + + +Shard dim: 256, Number of dags: 22 +Matmuls sharded with this dim: +[256(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [256(s),2,2,2,2,64] Number of occurrences: 1 +[256(s),2,8,128] @ [2,8,128,4,128] = [256(s),4,128] (stationary-streaming swapped) Number of occurrences: 1 +[256(s),2,8,128] @ [2,8,128,4,2,64] = [256(s),4,2,64] Number of occurrences: 1 +[64] @ [256(s)] = [64,256(s)] Number of occurrences: 1 + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[256,4,2,128] @ [4,2,128,2(s),2,4,128] = [256,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.021 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8590 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.021 seconds +2025-11-04T21:38:36Z INFO 8588 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.041 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.713 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 31 +total number of sharded dags: 21 + +total bytes transferred from input, output, non local tensors: 58991620 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 54657024 +% bytes transferred with 2x bandwidths: 92.65 + +NC0 FLOPs: 6449823747 +NC1 FLOPs: 6447202304 +% FLOPs sharded: 99.97 + + +Shard dim: 4, Number of dags: 12 +Matmuls sharded with this dim: +[256,2,8,128] @ [2,8,128,4(s),128] = [256,4(s),128] (stationary-streaming swapped) Number of occurrences: 1 +[256,2,8,128] @ [2,8,128,4(s),2,64] = [256,4(s),2,64] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 9 +Matmuls sharded with this dim: +[256,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [256,8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[256,2,8,128] @ [2,8,128,2(s),2,2,2,64] = [256,2(s),2,2,2,64] Number of occurrences: 1 +[256,2,8,128] @ [2,8,128,2(s),6,2,128] = [256,2(s),6,2,128] Number of occurrences: 2 +[256,4,2,128] @ [4,2,128,2(s),2,4,128] = [256,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.152 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 192: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: simd128x256 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: dma128x1024 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.033 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.743 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(28, 'AG84'), (20, 'AG86'), (23, 'AG85')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG89'), (21, 'AG92')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG89'), (17, 'AG96')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG102'), (22, 'AG104'), (25, 'AG103')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 631 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (23, 'AG85'), (1, 'AG88'), (26, 'AG87')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 520 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate0(2, 128, 2, 8, 128) is not sorted, index list (w/ AG ids): [(1, 'AG88'), (26, 'AG87'), (20, 'AG86'), (23, 'AG85')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 582 of IO tensor non_local bfloat16 %reshape.16(2, 2, 2, 2, 64, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG95'), (12, 'AG94'), (16, 'AG93'), (21, 'AG92'), (24, 'AG89'), (1, 'AG88')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 676 of IO tensor non_local bfloat16 %reshape.24(4, 2, 2, 64, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG97'), (13, 'AG98'), (17, 'AG96'), (24, 'AG89'), (1, 'AG88')] +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 614 of IO tensor non_local bfloat16 %reshape.29(4, 2, 2, 128, 128) is not sorted, index list (w/ AG ids): [(9, 'AG100'), (14, 'AG101'), (1, 'AG88'), (18, 'AG99')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.052 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.043 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.027 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(30, 'AG104'), (23, 'AG106'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(30, 'AG104'), (23, 'AG106'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(30, 'AG104'), (23, 'AG106'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG108'), (5, 'AG110'), (28, 'AG109')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG114'), (0, 'AG111'), (18, 'AG113'), (24, 'AG112')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(30, 'AG104'), (23, 'AG106'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG106'), (30, 'AG104'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(25, 'AG115'), (19, 'AG116')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG106'), (30, 'AG104'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(20, 'AG120'), (13, 'AG121')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG106'), (30, 'AG104'), (27, 'AG105')] +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(32, 'AG129'), (26, 'AG131'), (29, 'AG130')] +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.024 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.102 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.070 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.403 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.037 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.076 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.169 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.396 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.025 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.023 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.029 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.506 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.084 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x512 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: indirect_load128x512 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: rmsnorm128x512x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x256 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x256 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x512 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingBottleneck]: 4: simd128x128 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.038 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.117 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.028 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.032 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.037 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.027 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.310 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 192: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 64: matmul_128x128x512 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 24: simd128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 4: rmsnorm128x512x128 +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingBottleneck]: 4: simd64x512 +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.033 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.027 seconds +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.093 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/LICM]: LICM finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8590 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:37Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.102 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.024 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.028 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.086 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.085 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.039 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.093 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.026 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.031 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.022 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.084 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.032 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.209 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: dma128x512 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: transpose_128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: dma128x256 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: generic_store128x128 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: indirect_load128x512 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 4: dma128x4096 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.046 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.033 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.033 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.025 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.086 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.098 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.040 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.027 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.039 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.032 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.367 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 192: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 24: simd128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x128 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.023 seconds +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:38Z INFO 8589 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:38Z INFO 8590 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.045 seconds +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LICM]: LICM finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.052 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.053 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.118 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.090 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.035 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.108 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.030 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.030 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.050 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.077 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.035 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.053 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.019 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.039 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.061 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.070 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.033 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.038 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.034 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.093 seconds +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8589 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.075 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:39Z INFO 8588 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.041 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.055 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.051 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.017 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.079 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.036 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.038 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.038 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.039 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.056 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.072 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.057 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.053 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.040 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.200 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.036 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.038 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.064 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.068 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 76.191% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'933.1540'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1539, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_933 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 9.692% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.55'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'950.1550'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1548, src_id=None, , instances=600 # dl = tensor_op_name: convert.55_pftranspose_950 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 2.943% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 12, 512) %'input365_local_1017'[i_shard_1213,i15_0_0_0_1,i15_0_0_0_0,i0.128,i3.12,i1.128+128i2.2+256p_1650] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input365'[i15_0_0_0_1+2i15_0_0_0_0,p_1650,i_shard_1213,i0.128,i3.12,i2.2,i1.128] # id=1316, src_id=None, , instances=16 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 2.798% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input366_local_994'[i_shard_1213,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input366'[i_shard_1213,i10_0_0_1,i0.128,i1.4096] # id=1307, src_id=None, , instances=12 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 2.798% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input368_local_1005'[i_shard_1213,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input368'[i_shard_1213,i12_0_0_1,i0.128,i1.4096] # id=1310, src_id=None, , instances=12 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.520% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'937.1624'[i_shard_1213,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (256, 2048) %'add.9'[i0.128+128i2.2,i1.2048] # id=1510, src_id=None, , instances=2 # dl = tensor_op_name: add.9_pftranspose_937 | hlo_id: 27 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.520% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'941.1629'[i_shard_1221,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (524288,) %'all_reduce.3-buffer-2001'[2048i0.128+i1.2048+262144i2.2] # id=1521, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.3_pftranspose_941 | hlo_id: 66 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 0.337% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (524288,) %'dot.14-buffer-1999'[2048i0.128+i1.2048+262144i2.2] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %1285[i_shard_1213,i0.128,i2.2,i1.2048] # id=1321, src_id=None, , instances=2 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 0.337% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (256, 16, 128) %'convert.53'[i0.128+128i4.2,i2.8+8i3.2,i1.128] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2, 1024) %'945.1529'[i_shard_1221,i0.128,i4.2,i3.2,i1.128+128i2.8] # id=1527, src_id=None, , instances=2 # dl = tensor_op_name: convert.53_pftranspose_945 | hlo_id: 75 | [[i0.128];[i1.128, i2.8, i3.2, i4.2]] -> [[i0.128];[i1.128, i2.8, i3.2, i4.2]] +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 2.793us (256.000 B, est bw: 0.092GB/s, 0.140% of tot. time) for uint8<1 x 128> TongaSB partitions[1] uint8 (2, 1, 128) %'select.6538.1741'[i51_0_1116,0,i0.128] = load uint8<1 x 128> uint8 (2, 128) %'scatter.1'[i51_0_1116,i0.128] # id=1410, src_id=None, , instances=2 # dl = tensor_op_name: _select.6538 | hlo_id: 166 | [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.069 seconds +2025-11-04T21:38:40Z INFO 8589 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.019 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8590 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.051 seconds +2025-11-04T21:38:40Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.088 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.110 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.035 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.031 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.028 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.071 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.082 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.064 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.025 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.065 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.058 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.048 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.020 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.045 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.018 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.035 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.088 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.019 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.049 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.056 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.041 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.063 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.066 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.106 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.065 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.016 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.129 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.352 seconds +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8588 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.045 seconds +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:41Z INFO 8589 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.026 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.017 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.185 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 37.714us (8.000MiB, est bw: 222.428GB/s, 16.609% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 4096) %'input67_local_1548'[i2_0_1584,i35_0_0,c1_1541,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input67'[i35_0_0,c1_1541,i0.128,i1.4096] # id=1741, src_id=None, , instances=8 # dl = tensor_op_name: _dot.2 | hlo_id: 32 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 22.961us (1.000MiB, est bw: 45.668GB/s, 10.112% of tot. time) for bfloat16<128 x 128> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 4, 128) %'intermediate0_pftranspose_1465'[i0_0,i1_0_0,i1_0_1_0,i0.128,p_2156,i1.128] = load bfloat16<128 x 128> non_local bfloat16 (2, 2, 4, 128, 2, 128) %'all_gather.1'[i1_0_0,i1_0_1_0,p_2156,i0.128,i0_0,i1.128] # id=1700, src_id=None, , instances=32 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 22.961us (1.000MiB, est bw: 45.668GB/s, 10.112% of tot. time) for bfloat16<128 x 128> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 512) %'custom-call.177.2071'[i2_0_1584,i16_0_0_1530,i16_0_1_0_1530,i0.128,i1.128+128i16_0_1_1_1530] = load bfloat16<128 x 128> non_local bfloat16 (2, 2, 4, 128, 2, 128) %'all_gather.1'[i16_0_0_1530,i16_0_1_0_1530,i16_0_1_1_1530,i0.128,i2_0_1584,i1.128] # id=1736, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.177 | hlo_id: 24 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 8.591% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input62_local_1578'[i2_0_1584,c0_1572,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input62'[c0_1572,i0.128,i1.4096] # id=1835, src_id=None, , instances=4 # dl = tensor_op_name: _dot | hlo_id: 129 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 8.591% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input65_local_1563'[i64_0,c0_1557,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input65'[c0_1557,i0.128,i1.4096] # id=1788, src_id=None, , instances=4 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 8.591% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input61_local_1603'[i95_0_0_0,i95_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input61'[i95_0_0_0,i95_0_0_1,i0.128,i1.4096] # id=1840, src_id=None, , instances=4 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 7.534us (512.000KiB, est bw: 69.590GB/s, 3.318% of tot. time) for bfloat16<128 x 128> non_local bfloat16 (2, 2, 2, 128, 2, 128) %'reshape.16'[T_i41_0_0_1474_2622_2623,T_i41_0_1_1474_2620_2621_2623,T_i41_1_1474_2618_2619_2621_2623,i1.128,T_i40_0_1474,i0.128] = store bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (2, 128, 1024) %'1470.2021'[T_i40_0_1474,i1.128,i0.128+512T_i41_0_0_1474_2622_2623+256T_i41_0_1_1474_2620_2621_2623+128T_i41_1_1474_2618_2619_2621_2623] # id=2019, src_id=None, , instances=16 # dl = tensor_op_name: reshape.16_pftranspose_1470 | hlo_id: 79 | [[i1.128];[i0.128]] -> [[i1.128];[i0.128]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 7.534us (512.000KiB, est bw: 69.590GB/s, 3.318% of tot. time) for bfloat16<128 x 128> non_local bfloat16 (2, 2, 2, 128, 2, 128) %'reshape.24'[T_i0_0_1479_2631_2632,T_i0_1_1479_2629_2630_2632,T_i1_1479_2627_2628_2630_2632,i1.128,T_i2_0_1479,i0.128] = store bfloat16<128 x 128> TongaSB partitions[1] bfloat16 (2, 128, 1024) %'1475.2031'[T_i2_0_1479,i1.128,i0.128+512T_i0_0_1479_2631_2632+256T_i0_1_1479_2629_2630_2632+128T_i1_1479_2627_2628_2630_2632] # id=2029, src_id=None, , instances=16 # dl = tensor_op_name: reshape.24_pftranspose_1475 | hlo_id: 121 | [[i1.128];[i0.128]] -> [[i1.128];[i0.128]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 6.372us (1.000MiB, est bw: 164.552GB/s, 2.806% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[1] bfloat16 (2, 128, 8, 2, 128) %'get_tuple_element.1_local_1592'[i95_0_0_0,i0.128,i3.8,i2.2,i1.128] = load bfloat16<128 x 2048> non_local bfloat16 (8, 128, 2, 128) %'get_tuple_element.1'[i3.8,i0.128,i2.2,i1.128] # id=1839, src_id=None, , instances=2 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.128, i2.2, i3.8]] -> [[i0.128];[i1.128, i2.2, i3.8]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 5.409us (512.000KiB, est bw: 96.926GB/s, 2.382% of tot. time) for bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (2, 2, 128, 512) %'transpose.1_pftranspose_1460'[T_i2_0_1464,c0_1499_1881,i0.128,i1.512] = indirect_load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (151936, 2, 512) %'input60'[i0.128,T_i2_0_1464,i1.512] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[1] int32 (2, 128, 2, 1) %'gather.41.1879'[T_i2_0_1464,i0.128,c0_1499_1881,0] # id=1698, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=4 # dl = tensor_op_name: _gather.41 | hlo_id: 12 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.020 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.079 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.121 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.271 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 17.654% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 12, 512) %'input68_local_1337'[i_shard_1512,i15_0_0_0_1,i15_0_0_0_0,i0.128,i3.12,i1.128+128i2.2+256p_1832] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input68'[i15_0_0_0_1+2i15_0_0_0_0,p_1832,i_shard_1512,i0.128,i3.12,i2.2,i1.128] # id=1572, src_id=None, , instances=16 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 16.781% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input69_local_1314'[i_shard_1512,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input69'[i_shard_1512,i10_0_0_1,i0.128,i1.4096] # id=1563, src_id=None, , instances=12 # dl = tensor_op_name: _dot.4 | hlo_id: 40 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 55.921us (12.000MiB, est bw: 225.014GB/s, 16.781% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 6, 128, 4096) %'input71_local_1325'[i_shard_1512,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input71'[i_shard_1512,i12_0_0_1,i0.128,i1.4096] # id=1566, src_id=None, , instances=12 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 5.854% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input78_local_1360'[i38_0_0,c1_1353,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input78'[i38_0_0,c1_1353,i0.128,i1.4096] # id=1588, src_id=None, , instances=4 # dl = tensor_op_name: _dot.9 | hlo_id: 71 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 5.854% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input72_local_1442'[i98_0_0_0,i98_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input72'[i98_0_0_0,i98_0_0_1,i0.128,i1.4096] # id=1639, src_id=None, , instances=4 # dl = tensor_op_name: _dot.10 | hlo_id: 173 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 11.445us (2.000MiB, est bw: 183.243GB/s, 3.434% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 2, 128, 8, 256) %'input76_local_1381'[c0_1374,c1_1375,i0.128,i2.8,i1.256] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 8, 2, 256) %'input76'[c1_1375,i0.128,i2.8,c0_1374,i1.256] # id=1611, src_id=None, , instances=4 # dl = tensor_op_name: _dot.8 | hlo_id: 114 | [[i0.128];[i1.256, i2.8]] -> [[i0.128];[i1.256, i2.8]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 11.445us (2.000MiB, est bw: 183.243GB/s, 3.434% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 2, 128, 8, 256) %'input73_local_1417'[c0_1411,c1_1412,i0.128,i2.8,i1.256] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 8, 2, 256) %'input73'[c1_1412,i0.128,i2.8,c0_1411,i1.256] # id=1634, src_id=None, , instances=4 # dl = tensor_op_name: _dot.7 | hlo_id: 155 | [[i0.128];[i1.256, i2.8]] -> [[i0.128];[i1.256, i2.8]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 3.122% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1262.1796'[i_shard_1512,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (256, 2048) %'add.4'[i0.128+128i2.2,i1.2048] # id=1681, src_id=None, , instances=2 # dl = tensor_op_name: add.4_pftranspose_1262 | hlo_id: 15 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 3.122% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1266.1801'[i38_0_0,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (524288,) %'all_reduce.1-buffer-2286'[2048i0.128+i1.2048+262144i2.2] # id=1692, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.1_pftranspose_1266 | hlo_id: 54 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 2.022% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (524288,) %'dot.7-buffer-2284'[2048i0.128+i1.2048+262144i2.2] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %1549[i_shard_1512,i0.128,i2.2,i1.2048] # id=1577, src_id=None, , instances=2 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 17228) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8589 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.513 seconds +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.030 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8589 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 17228) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.400 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.044 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.023 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.032 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.061 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.055 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: BirCodeGen estimate #instances=1062 in sg0001 +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: IR signature: 4b0e19042ebc793009cb3bdb1c9953cc9bebd955f49273623907f11d79f456ed for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.055 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8589 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.054 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.017 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.050 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: BirCodeGen estimate #instances=1062 in sg0001 +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: IR signature: ae1d504415c845080646624f9643a3d484997fe42961f59d94117b1122e48e2b for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: Weights total number of bytes: 139266 +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.033 seconds +2025-11-04T21:38:43Z INFO 8589 [Tensorizer]: Successfully built model. +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.034 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:43Z INFO 8588 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8590 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.061 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.028 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: BirCodeGen estimate #instances=604 in sg0000 +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: IR signature: 4b22f1700e741dc1520c4efe01c7e440e37f2ed9be6fa5ce0f1b2935bae86015 for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.098 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8588 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.102 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.023 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: BirCodeGen estimate #instances=604 in sg0000 +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: IR signature: 3060bec34cef28113034a12598cc8a2e6a6103fb353dadaa422fa1df08a46f0d for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: Weights total number of bytes: 205058 +2025-11-04T21:38:44Z INFO 8588 [Tensorizer]: Successfully built model. +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.062 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8590 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.004 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.638 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.088 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.088 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:45Z WARNING 8590 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 79.62 percent of all matmul computation +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.027 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.086 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.018 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8590 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.048 seconds +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.046 seconds +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.039 seconds +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.339 seconds +2025-11-04T21:38:46Z INFO 8590 [Tensorizer]: BirCodeGen estimate #instances=25032 in sg0002 +2025-11-04T21:38:46Z INFO 8590 [Tensorizer]: IR signature: a2ee20f5b98873a4081e9496a68cc970c471f6aaed5102593913624f7beac807 for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:46Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8590 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.366 seconds +2025-11-04T21:38:47Z INFO 8590 [Tensorizer]: BirCodeGen estimate #instances=25032 in sg0002 +2025-11-04T21:38:47Z INFO 8590 [Tensorizer]: IR signature: 6f41dad01f8e98d30388522372925db3bc1a45ae4c1d1f75357b34166aaad5c7 for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:47Z INFO 8590 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:47Z INFO 8590 [Tensorizer]: Successfully built model. +2025-11-04T21:38:47Z USER 8522 [root/Tensorizer/Tensorizer]: Tensorizer finished after 14.801 seconds +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: End tensorization +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:47Z INFO 8522 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:47Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:47Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:47Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:47Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:47Z INFO 8522 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:47Z INFO 8522 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels io,scalar_dynamic_offset,vector_dynamic_offsets,spill_reload --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:47Z INFO 8522 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:47Z INFO 9029 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f" +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:47Z INFO 9029 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:47Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:47Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1888 blocks=6 instructions=1974 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.232.2291}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:47Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.232.2291}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 86mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 88mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.012 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 93mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 94mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.089 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 156mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.110 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:47Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.112 seconds +2025-11-04T21:38:47Z INFO 9029 [BackendPassManager]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:47Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1888 blocks=6 instructions=1974 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:47Z USER 9029 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:47Z USER 9029 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:47Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=298 blocks=2 instructions=212 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1176 blocks=2 instructions=1416 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:47Z USER 9029 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 298 memory location(s), 2 block(s), and 212 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=414 blocks=2 instructions=346 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1176 memory location(s), 2 block(s), and 1416 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 414 memory location(s), 2 block(s), and 346 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:47Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9029 [BackendPassManager]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:47Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1888 blocks=6 instructions=1974 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 161mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 149 memory location(s), 1 block(s), and 106 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=149 blocks=1 instructions=106 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 162mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 588 memory location(s), 1 block(s), and 708 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=588 blocks=1 instructions=708 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 167mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 173 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=207 blocks=1 instructions=173 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:47 2025 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Total count: 1058 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Matmult: 783 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: TensorScalarPtr: 58 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: GenericCopy: 49 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Load: 45 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: TensorTensor: 42 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Activation: 36 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: DMACopy: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Save: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: CoreBarrier: 5 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: CollectiveCompute: 3 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 8 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.040 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 195mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 644 memory location(s), 1 block(s), and 1058 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=644 blocks=1 instructions=1058 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z USER 9029 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 197mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Total count: 1062 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Matmult: 783 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: TensorScalarPtr: 58 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: GenericCopy: 49 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Load: 45 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: TensorTensor: 42 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Activation: 36 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Save: 11 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: DMACopy: 10 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: CoreBarrier: 5 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: CollectiveCompute: 3 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 8 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.043 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 195mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 644 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=644 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Total count: 602 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Matmult: 313 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: TensorScalarPtr: 68 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Load: 56 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: TensorTensor: 46 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: GenericCopy: 44 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Save: 23 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Activation: 17 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.042 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 195mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 628 memory location(s), 1 block(s), and 602 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=628 blocks=1 instructions=602 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Total count: 604 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Matmult: 313 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: TensorScalarPtr: 68 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Load: 56 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: TensorTensor: 46 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: GenericCopy: 44 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Save: 24 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Activation: 17 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: DMACopy: 11 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.057 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 196mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 628 memory location(s), 1 block(s), and 604 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=628 blocks=1 instructions=604 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z USER 9029 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 195mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9029 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.009 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 196mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:47Z USER 9029 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.017 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 197mb, ru_maxrss: 211mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Total count: 13213 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Matmult: 10538 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: GenericCopy: 1420 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Load: 356 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Save: 320 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: TensorTensor: 41 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Activation: 33 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.255 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 319mb, ru_maxrss: 319mb (delta=108mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5256 memory location(s), 1 block(s), and 13213 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5256 blocks=1 instructions=13213 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:47Z USER 9029 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.027 seconds +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:47Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:47 2025 + +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Total count: 13201 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Matmult: 10538 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: GenericCopy: 1420 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Load: 356 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Save: 308 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: TensorTensor: 41 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Activation: 33 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.331 seconds +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=108mb) +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5256 memory location(s), 1 block(s), and 13201 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:47Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5256 blocks=1 instructions=13201 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:47Z INFO 9029 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.033 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.381 seconds +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=108mb) +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6372 blocks=6 instructions=28910 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z USER 9029 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z USER 9029 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=602 blocks=2 instructions=2083 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 602 memory location(s), 2 block(s), and 2083 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5239 blocks=2 instructions=25624 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5239 memory location(s), 2 block(s), and 25624 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=531 blocks=2 instructions=1203 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 531 memory location(s), 2 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6372 blocks=6 instructions=28910 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1549_i1}@SB<0,0>(128x8192)#Internal DebugInfo: <_dot.6||UNDEF||[128, 4096, 1]> +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {_dot.199-t1285_i1}@SB<0,0>(128x8192)#Internal DebugInfo: <_dot.199||UNDEF||[128, 4096, 1]> +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1195_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1200_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.062 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.062 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.063 seconds +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6372 blocks=6 instructions=28910 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:48Z USER 9029 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=602 blocks=2 instructions=2083 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=531 blocks=2 instructions=1203 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=5239 blocks=2 instructions=25624 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 531 memory location(s), 2 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 602 memory location(s), 2 block(s), and 2083 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5239 memory location(s), 2 block(s), and 25624 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.006 seconds +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:48Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6372 blocks=6 instructions=28910 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z WARNING 9029 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z WARNING 9029 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z WARNING 9029 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z WARNING 9029 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 504 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 504 bytes/partition +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.006 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.007 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.007 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 3 clusters. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.008 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 260mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 3 clusters. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.006 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Allocs: 266 instructions: 603 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Allocs: 298 instructions: 1021 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Allocs: 265 instructions: 600 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Allocs: 304 instructions: 1062 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 1391 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Done build fdeps 1391 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=266 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: size = 63 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: found 73 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: mean: 2.31746 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: median: 0.667515 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 584 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: lo = 63 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: total = 63 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 15 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2162690 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: size = 169 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: found 30 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: largest = _dot.2-t1648_i2 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: tensors = 6 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: requires 20480 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.022 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: 15 remat count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Num intervals 169 Num locations 169 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z WARNING 9029 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 1389 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Done build fdeps 1389 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.021 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 265 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=265 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 2792 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Done build fdeps 2792 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 2896 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Done build fdeps 2896 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: size = 63 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: found 73 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: mean: 2.31746 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: median: 0.667515 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 584 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: lo = 63 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: total = 63 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.003 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 298 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=298 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.000 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 299 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=299 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 15 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: size = 81 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2162688 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: found 121 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: mean: 2.98765 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: median: 1.99733 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 968 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: lo = 81 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: total = 81 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: edge: 1585 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: mean: 18.7574 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: median: 14.539 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: safe = 164 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: unsafe = 3 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: total = 167 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 169 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Total: 167 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (167) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Rover zone: 0.964 (161) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.018 (3) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.018 (3) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.994 (166) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.961 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z WARNING 9029 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2162690 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.022 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 268 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=268 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 14265606, 73.8143% input load, 4.1346% output write, 22.0511% spill/reload [sg0000] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 7 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.05301e+07) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 304 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=304 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 305 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=305 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 734 bytes +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2162690 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 734 bytes +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 14265606, 73.8143% input load, 4.1346% output write, 22.0511% spill/reload [sg0000] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2162690 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1240 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: size = 168 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: found 30 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: largest = _dot.2-t1648_i11 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: tensors = 6 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: requires 20480 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: size = 85 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: found 127 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: mean: 2.98824 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: median: 2.33158 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 1016 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: 15 remat count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Num intervals 168 Num locations 168 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.011 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: lo = 85 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: total = 85 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 20 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: edge: 1581 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: mean: 18.8214 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: median: 14.8754 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: safe = 163 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: unsafe = 3 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: total = 166 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 168 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Total: 166 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (166) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Rover zone: 0.964 (160) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.018 (3) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.018 (3) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.994 (165) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.966 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2162688 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.014 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=267 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 14265604, 73.8143% input load, 4.13459% output write, 22.0511% spill/reload [sg0000] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.05301e+07) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 16 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.008 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.008 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.024 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28094980 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5225 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1310720 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1280 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: size = 180 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: found 65 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: largest = _dot.9-t1491_i5 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: requires 24576 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2162688 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 14265604, 73.8143% input load, 4.13459% output write, 22.0511% spill/reload [sg0000] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 12102916 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 6 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 28094980 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5225 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 3407874 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2660 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: 30 remat count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Num intervals 180 Num locations 180 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: edge: 2249 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: mean: 24.9889 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: median: 20.0391 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: size = 182 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: found 69 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: largest = _dot.9-t1491_i0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: requires 24576 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: 30 remat count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Num intervals 182 Num locations 182 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: edge: 2360 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: mean: 25.9341 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: median: 19.8467 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 1889 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2162688 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 734 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 529408 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1240 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: safe = 173 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: unsafe = 6 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: inf = 1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: total = 180 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 182 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Total: 180 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (180) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Rover zone: 0.983 (177) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.011 (2) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Blocks tall: 0.994 (179) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.975 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: safe = 171 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: unsafe = 6 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: inf = 1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: total = 178 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 180 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Total: 178 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (178) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Rover zone: 0.983 (175) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.011 (2) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.006 (1) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Blocks tall: 0.994 (177) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.980 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28094980 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5225 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 3407874 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2660 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 28094980 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5225 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1310720 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1280 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 300 memory location(s), 1 block(s), and 1021 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=300 blocks=1 instructions=1021 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 29405700, 86.6279% input load, 0% output write, 13.3721% spill/reload [sg0001] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 131072, 0.445737% out of total dma traffic(2.54735e+07) +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:48Z INFO 9029 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.023 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.008 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.011 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: reserved space = 139520 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 306 memory location(s), 1 block(s), and 1062 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=306 blocks=1 instructions=1062 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 31502854, 80.8611% input load, 3.32852% output write, 15.8104% spill/reload [sg0001] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 7 out of 52 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 603 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=267 blocks=1 instructions=603 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 267 memory location(s), 1 block(s), and 604 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=267 blocks=1 instructions=604 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 604, number of allocs: 267 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2663-0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 6.4e-05 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2663-0] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: input0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: input1: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: input2: [ 4 256 128 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: output0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 0 +Memory Location: {reshape.16}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 0 +Memory Location: {reshape.24}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 256 / 256 = 1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Scratch sbuf for kernel I-2663-0: [30208, 47436) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 131072, 0.416064% out of total dma traffic(2.54735e+07) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 0.003768 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 406 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=406 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 406 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=406 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 406 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=406 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 16 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 414 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=414 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 414 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=414 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 414 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 1280 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 27963908 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1310720 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1280 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 2660 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 27963908 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 3407874 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2660 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 131072, 0.445737% out of total dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 29274628, 86.568% input load, 0% output write, 13.432% spill/reload [sg0001] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 27963908 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1310720 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1280 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3605 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.014 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 131072, 0.416064% out of total dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 31371782, 80.7811% input load, 3.34242% output write, 15.8765% spill/reload [sg0001] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=414 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1019 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 27963908 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5460 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=297 blocks=1 instructions=1019 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 3407874 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2660 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 266240 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3744 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 414 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=414 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Allocs: 414 instructions: 787 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 1633 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [build_flow_deps]: Done build fdeps 1633 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 414 memory location(s), 1 block(s), and 787 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=414 blocks=1 instructions=787 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.020 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 402 memory location(s), 1 block(s), and 759 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: reserved space = 139520 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=402 blocks=1 instructions=759 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1060 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=303 blocks=1 instructions=1060 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 7 out of 51 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 600 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=266 blocks=1 instructions=600 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 266 memory location(s), 1 block(s), and 601 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=266 blocks=1 instructions=601 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 601, number of allocs: 266 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2663-0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 5.5e-05 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2663-0] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: input0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: input1: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: input2: [ 4 256 128 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: output0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 131072 +Memory Location: {reshape.16}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 131072 +Memory Location: {reshape.24}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 256 / 256 = 1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Scratch sbuf for kernel I-2663-0: [30208, 47436) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 0.011352 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.008 seconds +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 402 memory location(s), 1 block(s), and 759 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=402 blocks=1 instructions=759 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 405 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=405 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 405 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=405 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 405 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=405 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 8 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 16 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 413 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=413 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 413 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=413 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 262mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 751 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=393 blocks=1 instructions=751 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2663-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,32516>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 413 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=413 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 413 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=413 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Allocs: 413 instructions: 784 +2025-11-04T21:38:48Z USER 9029 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 751 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 1631 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [build_flow_deps]: Done build fdeps 1631 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 413 memory location(s), 1 block(s), and 784 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=413 blocks=1 instructions=784 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 401 memory location(s), 1 block(s), and 756 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=401 blocks=1 instructions=756 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.017 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1060 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=303 blocks=1 instructions=1060 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: reserved space = 73728 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1060 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=303 blocks=1 instructions=1060 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1060 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=303 blocks=1 instructions=1060 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 7 out of 59 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1060 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=303 blocks=1 instructions=1060 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 401 memory location(s), 1 block(s), and 756 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=401 blocks=1 instructions=756 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 303 memory location(s), 1 block(s), and 1061 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=303 blocks=1 instructions=1061 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 1061, number of allocs: 303 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2304-0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 0.000134 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.030 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2304-0] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: input0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: input1: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: input2: [ 4 256 128 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: output0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 0 +Memory Location: {reshape.60}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 0 +Memory Location: {reshape.68}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 256 / 256 = 1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Scratch sbuf for kernel I-2304-0: [67648, 84876) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1019 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=297 blocks=1 instructions=1019 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 8 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: reserved space = 73728 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 0.002793 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1019 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=297 blocks=1 instructions=1019 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 442 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=442 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 748 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=392 blocks=1 instructions=748 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 442 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=442 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 442 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=442 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z USER 9029 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 748 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1019 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=297 blocks=1 instructions=1019 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 3 out of 54 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1019 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=297 blocks=1 instructions=1019 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 263mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 297 memory location(s), 1 block(s), and 1020 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=297 blocks=1 instructions=1020 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 1020, number of allocs: 297 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2304-0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 0.000144 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2304-0] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: input0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: input1: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: input2: [ 4 256 128 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: output0: [ 4 128 256 ] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 131072 +Memory Location: {reshape.60}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[32768,4],[256,128],[1,256]] +Offset: 131072 +Memory Location: {reshape.68}@DRAM(262144x2)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 256 / 256 = 1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Scratch sbuf for kernel I-2304-0: [57344, 74572) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: seq_len=256, seq_len2=256, complete_seq_len2=256 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 0.004124 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 16 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 450 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=450 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 450 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=450 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,69956>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,69956>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,69956>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,69956>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 436 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=436 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 436 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=436 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 436 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=436 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 450 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=450 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 16 memorylocations +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 444 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=444 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 444 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=444 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1549_i1}@SB<0,36864>(128x8192)#Internal DebugInfo: <_dot.6||UNDEF||[128, 4096, 1]> +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,59652>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,59652>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,59652>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2304-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,59652>(128x4)#Internal DebugInfo: +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 450 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=450 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Allocs: 450 instructions: 1244 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 444 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=444 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 444 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=444 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Allocs: 444 instructions: 1203 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 3149 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [build_flow_deps]: Done build fdeps 3149 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 450 memory location(s), 1 block(s), and 1244 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=450 blocks=1 instructions=1244 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 3045 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [build_flow_deps]: Done build fdeps 3045 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 444 memory location(s), 1 block(s), and 1203 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=444 blocks=1 instructions=1203 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 438 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=438 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 264mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 432 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=432 blocks=1 instructions=1175 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 432 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=432 blocks=1 instructions=1175 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 8 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.015 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 438 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=438 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 8 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 423 memory location(s), 1 block(s), and 1167 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=423 blocks=1 instructions=1167 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 429 memory location(s), 1 block(s), and 1208 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=429 blocks=1 instructions=1208 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z USER 9029 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 429 memory location(s), 1 block(s), and 1208 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 423 memory location(s), 1 block(s), and 1167 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.100 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.102 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 265mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.013 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.005 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2395 memory location(s), 1 block(s), and 12421 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2395 blocks=1 instructions=12421 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.027 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 266mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2844 memory location(s), 1 block(s), and 13203 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2844 blocks=1 instructions=13203 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:48Z INFO 9029 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [build_flow_deps]: Allocs: 2846 instructions: 13201 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [build_flow_deps]: Allocs: 2397 instructions: 12423 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 32298 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [build_flow_deps]: Done build fdeps 32298 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 44214 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [build_flow_deps]: Done build fdeps 44214 Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.135 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2397 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:48 2025 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.133 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2846 memory location(s), 1 block(s), and 13201 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2846 blocks=1 instructions=13201 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.032 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2397 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2398 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2398 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: size = 1018 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.035 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2783 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2784 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2784 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 274mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: found 1023 edges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: mean: 2.00982 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: median: 1.99394 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 8184 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: size = 1142 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: lo = 944 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: total = 1018 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: found 1085 edges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: mean: 1.90018 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: median: 1.88528 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 8680 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.041 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: lo = 1068 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: total = 1142 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.016 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.041 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.016 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 11 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 69 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.065 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 176600466 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4273 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1201920 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2161 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: size = 1344 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: found 1013 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1129_i15 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: requires 18432 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.084 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 275mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 177235870 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4228 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2265867 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2098 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: 326 remat count +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: size = 1596 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Num intervals 1344 Num locations 1344 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: found 1137 accumulation groups +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1129_i0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: tensors = 7 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: requires 18432 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: edge: 7996 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: mean: 11.8988 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: median: 5.43862 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:48Z INFO 9029 []: find first defs for local +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: safe = 1333 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: unsafe = 8 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: inf = 1 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: total = 1342 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1344 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 []: find first defs for global +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Total: 1342 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (1342) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Rover zone: 0.984 (1321) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.012 (16) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.004 (5) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.017 (23) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.001 (2) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.700 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.714 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.714 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.981 (1317) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.761 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.997 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: 336 remat count +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 176600466 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4273 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1201920 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2161 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Num intervals 1596 Num locations 1596 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: edge: 9474 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: mean: 11.8722 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: median: 5.97425 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.093 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: safe = 1583 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: unsafe = 10 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: inf = 1 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: total = 1594 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1596 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Total: 1594 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (1594) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Rover zone: 0.969 (1545) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.024 (38) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.004 (7) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Slice zone: 0.003 (4) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.071 (113) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.008 (12) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.573 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.595 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.815 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.922 (1469) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.676 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.959 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.024 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2399 memory location(s), 1 block(s), and 12423 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2399 blocks=1 instructions=12423 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 177802386, 98.1426% input load, 0% output write, 1.85742% spill/reload [sg0002] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.745e+08) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 4286 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 2807 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 176600210 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4286 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1201664 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2807 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.0155032% out of total spill/reload dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.00028796% out of total dma traffic +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 177801874, 98.1429% input load, 0% output write, 1.85714% spill/reload [sg0002] +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 176600210 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4286 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1201664 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2807 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4269 bytes +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.114 seconds +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12422 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2397 blocks=1 instructions=12422 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 128 Sb address +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 177235870 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4228 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2265867 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2098 bytes +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.323 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.019 seconds +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2785 memory location(s), 1 block(s), and 13138 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2785 blocks=1 instructions=13138 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 179501737, 97.3896% input load, 2.22839e-06% output write, 2.6104% spill/reload [sg0002] +2025-11-04T21:38:48Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.74816e+08) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 4241 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 2379 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 177235614 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4241 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2265611 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2379 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.340 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12422 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2397 blocks=1 instructions=12422 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.0109268% out of total spill/reload dma traffic +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.000285234% out of total dma traffic +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 179501225, 97.3899% input load, 2.2284e-06% output write, 2.61012% spill/reload [sg0002] +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 177235614 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4241 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2265611 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2379 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4197 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.198 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13137 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2783 blocks=1 instructions=13137 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: spill space = 1048576 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 1048576 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: size = 1 +2025-11-04T21:38:49Z INFO 9029 []: find first defs for local +2025-11-04T21:38:49Z INFO 9029 []: find first defs for global +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 147 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: Num intervals 1 Num locations 1 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: lo = 1 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: total = 1 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.069 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12422 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2397 blocks=1 instructions=12422 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.012 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12422 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2397 blocks=1 instructions=12422 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 597 out of 1231 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12422 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2397 blocks=1 instructions=12422 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 12425, number of allocs: 2397 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.000628 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {_dot.199-t1285_i1}@SB<0,36864>(128x8192)#Internal DebugInfo: <_dot.199||UNDEF||[128, 4096, 1]> +2025-11-04T21:38:49Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1195_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:49Z WARNING 9029 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1200_i1}@SB<96,17536>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.036 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [build_flow_deps]: Allocs: 2397 instructions: 12425 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 32300 edges +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [build_flow_deps]: Done build fdeps 32300 Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.062 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 28 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 156 Sb address +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.096 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 296mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.308 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13137 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2783 blocks=1 instructions=13137 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.024 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2397 blocks=1 instructions=12425 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2397 memory location(s), 1 block(s), and 12425 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: spill space = 1055748 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 1077248 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:49Z INFO 9029 []: find first defs for local +2025-11-04T21:38:49Z INFO 9029 []: find first defs for global +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.054 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 288mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13137 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2783 blocks=1 instructions=13137 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.019 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13137 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2783 blocks=1 instructions=13137 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 597 out of 1370 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.010 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13137 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2783 blocks=1 instructions=13137 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 13140, number of allocs: 2783 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.000692 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.024 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.027 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [build_flow_deps]: Allocs: 2783 instructions: 13140 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 44153 edges +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [build_flow_deps]: Done build fdeps 44153 Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.047 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.006 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.078 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 300mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.056 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2783 blocks=1 instructions=13140 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.017 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2783 memory location(s), 1 block(s), and 13140 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 1.764 seconds +2025-11-04T21:38:49Z INFO 9029 [BackendPassManager]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:49Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6817 blocks=6 instructions=29439 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=852 blocks=2 instructions=2375 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=785 blocks=2 instructions=1499 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5180 blocks=2 instructions=25565 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 852 memory location(s), 2 block(s), and 2375 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 785 memory location(s), 2 block(s), and 1499 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=785 blocks=2 instructions=1499 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=852 blocks=2 instructions=2375 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 785 memory location(s), 2 block(s), and 1503 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=785 blocks=2 instructions=1503 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5180 memory location(s), 2 block(s), and 25565 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 854 memory location(s), 2 block(s), and 2383 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=854 blocks=2 instructions=2383 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=5180 blocks=2 instructions=25565 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.006 seconds +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 785 memory location(s), 2 block(s), and 1507 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z USER 9029 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.006 seconds +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 854 memory location(s), 2 block(s), and 2387 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.015 seconds +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5186 memory location(s), 2 block(s), and 25583 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=5186 blocks=2 instructions=25583 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.056 seconds +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5186 memory location(s), 2 block(s), and 25587 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:49Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.075 seconds +2025-11-04T21:38:49Z INFO 9029 [BackendPassManager]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29481 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:49Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: reserved space = 106752 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: spill space = 5767168 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 5767168 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:49Z INFO 9029 []: find first defs for local +2025-11-04T21:38:49Z INFO 9029 []: find first defs for global +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3670016 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3670016 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: reserved space = 40960 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: spill space = 7340032 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 7340032 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:49Z USER 9029 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1173 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: reserved space = 40960 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: spill space = 7340032 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 7340032 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: size = 9 +2025-11-04T21:38:49Z INFO 9029 []: find first defs for local +2025-11-04T21:38:49Z INFO 9029 []: find first defs for global +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: lo = 9 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: total = 9 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 4194304 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 4194304 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 6291456 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:49Z USER 9029 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: reserved space = 106752 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: spill space = 5767168 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 5767168 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:49Z USER 9029 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 5767168 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:49Z USER 9029 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: reserved space = 1081344 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: spill space = 3464194 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 3510272 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:49Z USER 9029 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.041 seconds +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12436 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: reserved space = 1090572 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: spill space = 3464194 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 3510272 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:49Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: size = 18 +2025-11-04T21:38:49Z INFO 9029 []: find first defs for local +2025-11-04T21:38:49Z INFO 9029 []: find first defs for global +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Num intervals 18 Num locations 18 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: lo = 18 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: total = 18 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 3162112 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 3162112 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 4210688 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.082 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 294mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.084 seconds +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29481 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:50Z USER 9029 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:50Z USER 9029 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:50Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=785 blocks=2 instructions=1507 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=854 blocks=2 instructions=2387 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=5186 blocks=2 instructions=25587 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 854 memory location(s), 2 block(s), and 2387 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z USER 9029 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5186 memory location(s), 2 block(s), and 25587 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 785 memory location(s), 2 block(s), and 1507 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29481 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.001 seconds +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1173 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.025 seconds +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.027 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12436 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.028 seconds +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: curr_vmrss: 290mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29481 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:50Z INFO 9029 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=3216 blocks=3 instructions=14361 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:50Z INFO 9029 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=3609 blocks=3 instructions=15120 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.165 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 297mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3216 memory location(s), 3 block(s), and 14361 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.183 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 294mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3609 memory location(s), 3 block(s), and 15120 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: nc_parallel_pass finished after 0.189 seconds +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29481 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1173 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12436 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:50Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware simulation time: 219864 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.026 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware simulation time: 215132 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.036 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware simulation time: 9150759 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.047 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z USER 9029 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:50Z USER 9029 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware simulation time: 8909541 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.072 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1173 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1173 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=424 blocks=1 instructions=1173 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z USER 9029 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 319mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Time-aware simulation time: 1176771 +2025-11-04T21:38:50Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.708 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 334mb, ru_maxrss: 334mb (delta=15mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12436 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 326mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12436 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2400 blocks=1 instructions=12436 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9029 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 327mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9029 [post_scheduler]: Time-aware simulation time: 1340570 +2025-11-04T21:38:51Z INFO 9029 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.987 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 328mb, ru_maxrss: 334mb (delta=15mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.022 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:51Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 1.025 seconds +2025-11-04T21:38:51Z INFO 9029 [BackendPassManager]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=15mb) +2025-11-04T21:38:51Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:51Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29475 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z USER 9029 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z USER 9029 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=785 blocks=2 instructions=1507 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=854 blocks=2 instructions=2386 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z USER 9029 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 785 memory location(s), 2 block(s), and 1507 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9029 (sg01) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 854 memory location(s), 2 block(s), and 2386 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5186 blocks=2 instructions=25582 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9029 (sg02) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5186 memory location(s), 2 block(s), and 25582 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:51Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9029 [BackendPassManager]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:51Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29475 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 34 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 12 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 34 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 12 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 42 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 42 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 17 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 17 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 44 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 50 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 51 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 39 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 53 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 11 PSUM Banks +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.026 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.026 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 19 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.035 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.011 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [build_flow_deps]: Allocs: 430 instructions: 1214 +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.011 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [build_flow_deps]: Allocs: 393 instructions: 755 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 3125 edges +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [build_flow_deps]: Done build fdeps 3125 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.013 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [build_flow_deps]: Allocs: 392 instructions: 752 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.055 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 1 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 35 │ 25174532 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 23 │ 3407872 │ +│ Save │ Internal │ 17 │ 2621440 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 256 │ 25 │ +│ 512 │ 10 │ +│ 2048 │ 4 │ +│ 4096 │ 5 │ +│ 6144 │ 8 │ +│ 8192 │ 20 │ +│ 524288 │ 3 │ +│ 1048576 │ 10 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 815 #MatMult-Transposes 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1337_i3 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i2 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i1 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i0 │ Internal │ bfloat16 │ 1572864 │ +│ intermediate5 │ Output │ bfloat16 │ 1048576 │ +│ all_reduce.1-buffer-2286 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3 │ Input │ bfloat16 │ 1048576 │ +│ dot.11-buffer-2289 │ Internal │ bfloat16 │ 1048576 │ +│ dot.7-buffer-2284 │ Internal │ bfloat16 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z USER 9029 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1214 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 1641 edges +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [build_flow_deps]: Done build fdeps 1641 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 1634 edges +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [build_flow_deps]: Done build fdeps 1634 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.013 seconds +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.011 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.013 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 38144 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10491908 │ +│ Load │ Internal │ 53 │ 2359296 │ +│ Save │ Internal │ 28 │ 1835008 │ +│ Save │ Internal -> Output │ 3 │ 589824 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 38144 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10491908 │ +│ Load │ Internal │ 53 │ 2359296 │ +│ Save │ Internal │ 28 │ 1835008 │ +│ Save │ Internal -> Output │ 4 │ 589826 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 8 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 1 │ +│ 256 │ 76 │ +│ 512 │ 6 │ +│ 2048 │ 6 │ +│ 4096 │ 2 │ +│ 8192 │ 10 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 8 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 1 │ +│ 256 │ 76 │ +│ 512 │ 6 │ +│ 2048 │ 6 │ +│ 4096 │ 2 │ +│ 8192 │ 10 │ +│ 1048576 │ 10 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 345 #MatMult-Transposes 53 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 345 #MatMult-Transposes 53 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: IO Tensor size combined: 457972228 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: IO Tensor size combined: 457972228 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input67_local_1548_i0 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1603_i0 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3 │ Output │ bfloat16 │ 1048576 │ +│ intermediate0 │ Output │ bfloat16 │ 1048576 │ +│ input61_local_1603_i1 │ Internal │ bfloat16 │ 1048576 │ +│ all_gather.1 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1548_i2 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1548_i1 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3-buffer-2651 │ Internal │ bfloat16 │ 1048576 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z USER 9029 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input67_local_1548_i4 │ Internal │ bfloat16 │ 1048576 │ +│ input61_local_1603_i2 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3 │ Output │ bfloat16 │ 1048576 │ +│ intermediate0 │ Output │ bfloat16 │ 1048576 │ +│ input61_local_1603_i3 │ Internal │ bfloat16 │ 1048576 │ +│ all_gather.1 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1548_i6 │ Internal │ bfloat16 │ 1048576 │ +│ input67_local_1548_i5 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3-buffer-2651 │ Internal │ bfloat16 │ 1048576 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 755 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 752 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [build_flow_deps]: Allocs: 424 instructions: 1172 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 3032 edges +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [build_flow_deps]: Done build fdeps 3032 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.009 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 35 │ 25174532 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 23 │ 3407872 │ +│ Save │ Internal │ 16 │ 1572864 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 256 │ 25 │ +│ 512 │ 10 │ +│ 2048 │ 4 │ +│ 4096 │ 5 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 524288 │ 3 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 783 #MatMult-Transposes 76 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1337_i7 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i6 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i5 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i4 │ Internal │ bfloat16 │ 1572864 │ +│ intermediate5 │ Output │ bfloat16 │ 1048576 │ +│ all_reduce.1-buffer-2286 │ Internal │ bfloat16 │ 1048576 │ +│ intermediate3 │ Input │ bfloat16 │ 1048576 │ +│ dot.11-buffer-2289 │ Internal │ bfloat16 │ 1048576 │ +│ dot.7-buffer-2284 │ Internal │ bfloat16 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:51Z USER 9029 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 742 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 689 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 338 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 43 PSUM Banks +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 36 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 36 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 32 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.651 seconds +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9029 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.659 seconds +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 334mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.199 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 349mb, ru_maxrss: 349mb (delta=15mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.218 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 349mb (delta=15mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.043 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.029 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 329mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [build_flow_deps]: Allocs: 2786 instructions: 13151 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [build_flow_deps]: Allocs: 2400 instructions: 12431 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 42602 edges +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 32301 edges +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [build_flow_deps]: Done build fdeps 32301 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [build_flow_deps]: Done build fdeps 42602 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.125 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174467084 │ +│ Load │ Internal │ 10 │ 2100358 │ +│ Save │ Internal │ 301 │ 1201664 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 3 │ +│ 4096 │ 297 │ +│ 6144 │ 8 │ +│ 8192 │ 15 │ +│ 524288 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 10410 #MatMult-Transposes 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348922896 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 1024 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1017_i7 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i6 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i5 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i4 │ Internal │ bfloat16 │ 1572864 │ +│ intermediate84 │ Input │ bfloat16 │ 1048576 │ +│ all_reduce.3-buffer-2001 │ Internal │ bfloat16 │ 1048576 │ +│ convert.53 │ Internal │ bfloat16 │ 1048576 │ +│ input366_local_994_i7 │ Internal │ bfloat16 │ 1048576 │ +│ input366_local_994_i6 │ Internal │ bfloat16 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.009 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12431 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.161 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 4 │ 1048576 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174467084 │ +│ Load │ Internal │ 24 │ 2419594 │ +│ Save │ Internal │ 319 │ 2265607 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 19 │ +│ 2048 │ 1 │ +│ 4096 │ 297 │ +│ 6144 │ 8 │ +│ 8192 │ 16 │ +│ 9496 │ 2 │ +│ 524288 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 10534 #MatMult-Transposes 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348922896 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 1024 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input365_local_1017_i3 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i2 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i1 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i0 │ Internal │ bfloat16 │ 1572864 │ +│ intermediate84 │ Input │ bfloat16 │ 1048576 │ +│ convert.53 │ Internal │ bfloat16 │ 1048576 │ +│ add.9 │ Internal │ bfloat16 │ 1048576 │ +│ -t2982 │ Internal │ float32 │ 1048576 │ +│ -t2976 │ Internal │ float32 │ 1048576 │ +└────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13151 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 1.069 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=15mb) +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=6825 blocks=6 instructions=29475 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 61 DMA instructions. Moved 33 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 60 DMA instructions. Moved 32 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 19 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 17 DMA instructions. Moved 1 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 323 DMA instructions. Moved 4 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 303 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:52Z INFO 9029 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: assign_trigger_engine finished after 0.027 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Output has 6 module(s), 6 function(s), 6825 memory location(s), 6 block(s), and 29475 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29475 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=430 blocks=1 instructions=1214 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=424 blocks=1 instructions=1172 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2786 blocks=1 instructions=13151 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2400 blocks=1 instructions=12431 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=392 blocks=1 instructions=752 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=393 blocks=1 instructions=755 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.007 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.008 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=6825 blocks=6 instructions=29489 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: assign_hwdge_engine finished after 0.010 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Output has 6 module(s), 6 function(s), 6825 memory location(s), 6 block(s), and 29489 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29489 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=393 blocks=1 instructions=757 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 8 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 57 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 29 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 28 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=430 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=424 blocks=1 instructions=1174 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2400 blocks=1 instructions=12434 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 9 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 27 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 46 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 16 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 8 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 26 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 44 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 16 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=430 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=424 blocks=1 instructions=1174 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=392 blocks=1 instructions=754 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 8 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 57 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 28 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 28 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=392 blocks=1 instructions=754 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2786 blocks=1 instructions=13154 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 7 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 2 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 3 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 320 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.005 seconds +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=393 blocks=1 instructions=757 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2400 blocks=1 instructions=12434 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 10 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 9 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 24 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 9 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 320 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 1 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.008 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2786 blocks=1 instructions=13154 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.017 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29489 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:52Z USER 9029 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:52Z INFO 9029 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=3216 blocks=3 instructions=14362 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=3609 blocks=3 instructions=15127 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9029 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3609 memory location(s), 3 block(s), and 15127 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3216 memory location(s), 3 block(s), and 14362 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: nc_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29489 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=424 blocks=1 instructions=1174 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=424 blocks=1 instructions=1174 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2400 blocks=1 instructions=12434 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2400 blocks=1 instructions=12434 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=424 blocks=1 instructions=1174 Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=430 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=430 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=392 blocks=1 instructions=754 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2786 blocks=1 instructions=13154 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=392 blocks=1 instructions=754 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2786 blocks=1 instructions=13154 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=392 blocks=1 instructions=754 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 571 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 624 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 624 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=430 blocks=1 instructions=1216 Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1165 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 1237 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 1237 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1109 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 1174 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 1174 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=393 blocks=1 instructions=757 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.011 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=393 blocks=1 instructions=757 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=393 blocks=1 instructions=757 Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 573 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 626 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 626 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Finished dependency reduction: 3070 removed, new total 407 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z USER 9029 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 393 memory location(s), 1 block(s), and 757 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Finished dependency reduction: 3074 removed, new total 407 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z USER 9029 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.022 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 392 memory location(s), 1 block(s), and 754 instruction(s). Max writers: 16 Max Readers: 52 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.029 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2400 blocks=1 instructions=12434 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Finished dependency reduction: 5732 removed, new total 438 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Finished dependency reduction: 5729 removed, new total 429 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z USER 9029 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.034 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1174 instruction(s). Max writers: 16 Max Readers: 76 +2025-11-04T21:38:52Z USER 9029 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.026 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 430 memory location(s), 1 block(s), and 1216 instruction(s). Max writers: 16 Max Readers: 108 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.036 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2786 blocks=1 instructions=13154 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 10965 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 11591 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 11591 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 11508 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 12501 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 12501 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Finished dependency reduction: 54661 removed, new total 3063 +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z USER 9029 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.246 seconds +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2400 memory location(s), 1 block(s), and 12434 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Finished dependency reduction: 71935 removed, new total 3887 +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:52Z USER 9029 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.284 seconds +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9029 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2786 memory location(s), 1 block(s), and 13154 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.336 seconds +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: curr_vmrss: 339mb, ru_maxrss: 349mb (delta=0mb) +2025-11-04T21:38:52Z USER 9029 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:52Z INFO 9029 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=6825 blocks=6 instructions=29489 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9029 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:52Z INFO 9029 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=3216 blocks=3 instructions=14362 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:52Z INFO 9029 (nc01/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:52Z INFO 9029 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:52Z INFO 9029 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:52Z USER 9029 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:52Z INFO 9029 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=3609 blocks=3 instructions=15127 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: DMA Descriptor ReUse Enabled. +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:52Z INFO 9029 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 31896 #MatMult-Transposes 7251 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 108 #out: 0 #inp: 108 #symmetric: 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: bir_linker finished after 0.359 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 467mb, ru_maxrss: 467mb (delta=118mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 223439258, 94.1519% input load, 0.263975% output write, 5.5841% spill/reload +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 425mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 38144 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10491908 │ +│ Load │ Internal │ 53 │ 2359296 │ +│ Save │ Internal │ 28 │ 1835008 │ +│ Save │ Internal -> Output │ 3 │ 589824 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 8 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 1 │ +│ 256 │ 76 │ +│ 512 │ 6 │ +│ 2048 │ 6 │ +│ 4096 │ 2 │ +│ 8192 │ 10 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 35 │ 25174532 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 23 │ 3407872 │ +│ Save │ Internal │ 16 │ 1572864 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 256 │ 25 │ +│ 512 │ 10 │ +│ 2048 │ 4 │ +│ 4096 │ 5 │ +│ 6144 │ 8 │ +│ 8192 │ 18 │ +│ 524288 │ 3 │ +│ 1048576 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 1 │ 1048576 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174467084 │ +│ Load │ Internal │ 10 │ 2100358 │ +│ Save │ Internal │ 301 │ 1201664 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 3 │ +│ 4096 │ 297 │ +│ 6144 │ 8 │ +│ 8192 │ 15 │ +│ 524288 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 32884 #MatMult-Transposes 8115 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 108 #out: 0 #inp: 108 #symmetric: 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 11538 #MatMult-Transposes 5275 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781415468 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1337_i7_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i5_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i6_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i4_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i7_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i6_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i5_sg0001 │ Internal │ bfloat16 │ 1572864 │ +└───────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: report_stats finished after 0.010 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 425mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.023 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 424mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: spill space = 58851384 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 58966016 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.020 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: bir_linker finished after 0.476 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=118mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 227235765, 92.718% input load, 0.721017% output write, 6.56095% spill/reload +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 2 │ 622329856 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 38144 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10491908 │ +│ Load │ Internal │ 53 │ 2359296 │ +│ Save │ Internal │ 28 │ 1835008 │ +│ Save │ Internal -> Output │ 4 │ 589826 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 8 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 1 │ +│ 256 │ 76 │ +│ 512 │ 6 │ +│ 2048 │ 6 │ +│ 4096 │ 2 │ +│ 8192 │ 10 │ +│ 1048576 │ 10 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 1 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 8 │ 268435456 │ +│ DMACopy │ Internal -> Output │ 1 │ 2097152 │ +│ DMACopy (Spill) │ Internal │ 8 │ 0 │ +│ Load │ Const -> Internal │ 3 │ 36864 │ +│ Load │ ExternalInput -> Internal │ 35 │ 25174532 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 23 │ 3407872 │ +│ Save │ Internal │ 17 │ 2621440 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 2 │ +│ 32 │ 2 │ +│ 256 │ 25 │ +│ 512 │ 10 │ +│ 2048 │ 4 │ +│ 4096 │ 5 │ +│ 6144 │ 8 │ +│ 8192 │ 20 │ +│ 524288 │ 3 │ +│ 1048576 │ 10 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 1572864 │ +│ DMACopy │ Internal │ 4 │ 1048576 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 324 │ 174467084 │ +│ Load │ Internal │ 24 │ 2419594 │ +│ Save │ Internal │ 319 │ 2265607 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 19 │ +│ 2048 │ 1 │ +│ 4096 │ 297 │ +│ 6144 │ 8 │ +│ 8192 │ 16 │ +│ 9496 │ 2 │ +│ 524288 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 11694 #MatMult-Transposes 5307 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781415468 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────────┼──────────┼──────────┼──────────────┤ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ input68_local_1337_i3_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i1_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i2_sg0001 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i0_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i3_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1017_i2_sg0002 │ Internal │ bfloat16 │ 1572864 │ +│ input68_local_1337_i1_sg0001 │ Internal │ bfloat16 │ 1572864 │ +└───────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: report_stats finished after 0.006 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.025 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: spill space = 58851384 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 58966016 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: size = 86 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 86 Num locations 86 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: lo = 86 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: total = 86 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 6291456 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 10629120 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.023 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: nc_parallel_pass finished after 0.550 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=118mb) +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=7853 blocks=8 instructions=29573 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=7853 blocks=8 instructions=29573 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 29573 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=7853 blocks=8 instructions=29573 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.013 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.026 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.007 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14404 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=3730 blocks=4 instructions=14404 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14411 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=3730 blocks=4 instructions=14411 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.011 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1139/1139 (100% DGE) + power-of-2 partition : 1139/1173 (97.1014% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1140/1174 (97.1039% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 1223/1223 (100% DGE) + power-of-2 partition : 1223/1532 (79.8303% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1223/1532 (79.8303% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 28 + Transpose : 224 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 226/226 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_dma finished after 0.021 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15169 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14411 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=4123 blocks=4 instructions=15169 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=3730 blocks=4 instructions=14411 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15176 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=4123 blocks=4 instructions=15176 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: expand_all_engine finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14411 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=3730 blocks=4 instructions=14411 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.017 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14411 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=3730 blocks=4 instructions=14411 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: expand_inst_late finished after 0.018 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14456 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=3730 blocks=4 instructions=14456 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [SeqInstOpt]: Removing 18 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [SeqInstOpt]: Removing 17 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 14421 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=3730 blocks=4 instructions=14421 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_sync finished after 0.008 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15250 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=3730 blocks=4 instructions=15250 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1166/1166 (100% DGE) + power-of-2 partition : 1194/1257 (94.9881% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1195/1258 (94.9921% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 1256/1256 (100% DGE) + power-of-2 partition : 1256/1598 (78.5983% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1256/1598 (78.5983% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 56 + Transpose : 224 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 226/226 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_dma finished after 0.051 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15176 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=4123 blocks=4 instructions=15176 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_act finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=3730 blocks=4 instructions=15265 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: expand_all_engine finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15176 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=4123 blocks=4 instructions=15176 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.020 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 415mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15176 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=4123 blocks=4 instructions=15176 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: expand_inst_late finished after 0.022 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15221 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=4123 blocks=4 instructions=15221 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [SeqInstOpt]: Removing 18 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [SeqInstOpt]: Removing 17 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 15186 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=4123 blocks=4 instructions=15186 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_dve finished after 0.056 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=3730 blocks=4 instructions=15265 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_sync finished after 0.010 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16156 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=4123 blocks=4 instructions=16156 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: lower_ap finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_act finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 418mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=3730 blocks=4 instructions=15265 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=4123 blocks=4 instructions=16172 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z USER 9029 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.052 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: curr_vmrss: 421mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_dve finished after 0.069 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 344mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=4123 blocks=4 instructions=16172 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: lower_ap finished after 0.009 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 344mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=4123 blocks=4 instructions=16172 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:53Z INFO 9029 []: find first defs for local reg +2025-11-04T21:38:53Z INFO 9029 []: find first defs for global reg +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:53Z USER 9029 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.080 seconds +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: curr_vmrss: 347mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: nc_parallel_pass finished after 0.343 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 347mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: vnc_remote_addr_map finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 347mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 31437 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 [VncLink]: Found 0 remote updates +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 347mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 31437 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z USER 9029 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=3730 blocks=4 instructions=15265 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=4123 blocks=4 instructions=16172 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.079 seconds +2025-11-04T21:38:53Z USER 9029 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.079 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 348mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 348mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.091 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 348mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: curr_vmrss: 348mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9029 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 31437 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: subgraph_parallel_pass finished after 0.018 seconds +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: curr_vmrss: 348mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:53Z USER 9029 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:53Z INFO 9029 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9029 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:53Z USER 9029 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=3730 blocks=4 instructions=15265 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=4123 blocks=4 instructions=16172 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:53Z INFO 9029 (nc00/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000458729 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000456814 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 11625 │ +│ LDWEIGHTS │ 11623 │ +│ EVENT_SEMAPHORE │ 829 │ +│ CAST │ 663 │ +│ COPY │ 656 │ +│ UNKNOWN(0xd4) │ 523 │ +│ PSEUDO_DMA_TRIGGER │ 336 │ +│ ACTIVATE │ 222 │ +│ TENSOR_TENSOR │ 159 │ +│ UNKNOWN(0xd3) │ 145 │ +│ UNKNOWN(0xd8) │ 53 │ +│ MEMSET │ 46 │ +│ TENSOR_SCALAR_ADDR │ 41 │ +│ UNKNOWN(0xda) │ 26 │ +│ UNKNOWN(0x92) │ 24 │ +│ TENSOR_SCALAR │ 22 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ TENSOR_REDUCE │ 19 │ +│ RECIPROCAL │ 17 │ +│ UNKNOWN(0x9a) │ 16 │ +│ UNKNOWN(0x9b) │ 16 │ +│ STREAM_SHUFFLE │ 16 │ +│ LOAD_MASK_SELECT │ 16 │ +│ UNKNOWN(0x24) │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 15 │ +│ MOVE │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ ALU_OP │ 2 │ +│ IOTA │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 966 │ +│ Scalar │ 2332 │ +│ Tensor │ 23340 │ +│ SyncDMA │ 0 │ +│ Vector │ 422 │ +│ Sync │ 157 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:53Z USER 9029 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.227 seconds +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 4096 │ +│ qPoolSpillReload0_defId_1 │ 4096 │ +│ qPoolSpillReload0_defId_2 │ 4 │ +│ qSPIO0 │ 10824 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 14 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 19634 (0.000292569 GB) +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:53Z INFO 9029 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2663-0_b3_grp_0_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 2 │ +│ I-2304-0_grp_1_sec_0_mhlo_exponential_6_b3_i0_sg0001 │ Internal │ bfloat16 │ 2 │ +│ I-2304-0_b0_grp_0_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 2 │ +│ I-2304-0_b3_grp_1_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ input1 │ ExternalInput │ int32 │ 3 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1665_sg0001 │ Internal │ int32 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 297 │ +└──────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9029 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.033 seconds +2025-11-04T21:38:54Z INFO 9029 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 11905 │ +│ LDWEIGHTS │ 11905 │ +│ EVENT_SEMAPHORE │ 970 │ +│ COPY │ 788 │ +│ CAST │ 663 │ +│ UNKNOWN(0xd4) │ 533 │ +│ PSEUDO_DMA_TRIGGER │ 376 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ GATHER │ 291 │ +│ ACTIVATE │ 229 │ +│ TENSOR_TENSOR │ 161 │ +│ UNKNOWN(0xd3) │ 145 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ MEMSET │ 60 │ +│ UNKNOWN(0xd8) │ 53 │ +│ TENSOR_SCALAR_ADDR │ 41 │ +│ UNKNOWN(0xda) │ 26 │ +│ TENSOR_REDUCE │ 24 │ +│ UNKNOWN(0x92) │ 24 │ +│ TENSOR_SCALAR │ 24 │ +│ STREAM_SHUFFLE │ 20 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ LOAD_MASK_SELECT │ 20 │ +│ RECIPROCAL │ 19 │ +│ ACT_TABLE_LOAD │ 16 │ +│ UNKNOWN(0x9b) │ 16 │ +│ UNKNOWN(0x24) │ 16 │ +│ UNKNOWN(0x9a) │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ IOTA │ 2 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 1610 │ +│ Scalar │ 2476 │ +│ Tensor │ 23906 │ +│ SyncDMA │ 0 │ +│ Vector │ 1059 │ +│ Sync │ 190 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:54Z USER 9029 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.301 seconds +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 142 │ +│ qPoolSpillReload0_defId_0 │ 4096 │ +│ qPoolSpillReload0_defId_1 │ 4864 │ +│ qPoolSpillReload0_defId_2 │ 972 │ +│ qSPIO0 │ 10826 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 358 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 21918 (0.000326604 GB) +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌───────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├───────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2663-0_b0_grp_1_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ get_tuple_element.7_sg0002 │ Internal │ float32 │ 3 │ +│ input1 │ ExternalInput │ int32 │ 3 │ +│ scatter.1_sg0002 │ Internal │ uint8 │ 3 │ +│ all-reduce.465.2305_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1665_sg0001 │ Internal │ int32 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 298 │ +└───────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9029 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.061 seconds +2025-11-04T21:38:54Z USER 9029 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.022 seconds +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:54Z USER 9029 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.339 seconds +2025-11-04T21:38:54Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9029 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 3730 memory location(s), 4 block(s), and 15265 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9029 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.060 seconds +2025-11-04T21:38:54Z USER 9029 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.396 seconds +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4123 memory location(s), 4 block(s), and 16172 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9029 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:54Z USER 9029 [BackendPassManager]: mod_parallel_pass finished after 0.399 seconds +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: curr_vmrss: 370mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:54Z USER 9029 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.156KB │ 21.312KB │ +│ CCE │ 168.000KB │ 24.000KB │ +│ Transpose │ 0.000B │ 128.000KB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.000KB │ 97.500KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:54Z INFO 9029 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.655GB │ +│ Model Code │ 1.785MB │ +│ Model Constants │ 481.012KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 10.133MB │ +│ DMA Ring IO │ 185.156KB │ +│ DMA Ring Spill │ 270.812KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9029 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.125KB │ 9.656KB │ +│ CCE │ 168.000KB │ 0.000B │ +│ Transpose │ 0.000B │ 128.000KB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 15.750KB │ 81.250KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:54Z INFO 9029 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.651GB │ +│ Model Code │ 1.661MB │ +│ Model Constants │ 479.004KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.000MB │ +│ DMA Ring IO │ 184.875KB │ +│ DMA Ring Spill │ 218.906KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9029 [HBMUsage]: Total estimated HBM usage is: 3.663GB +2025-11-04T21:38:54Z USER 9029 [BackendPassManager]: hbm_usage finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: curr_vmrss: 370mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 31437 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9029 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=7853 blocks=8 instructions=31437 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1688_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1611-1690_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1621-1692_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2003_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1990_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1450-1550_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1458-1552_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1686_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1070-1295_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1515_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1688_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1611-1690_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1621-1692_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2003_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1990_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1450-1550_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1458-1552_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1686_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1515_CRSM.npy +2025-11-04T21:38:54Z INFO 9029 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:54Z WARNING 9029 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/metrics.json +2025-11-04T21:38:54Z WARNING 9029 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:54Z INFO 9029 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff +2025-11-04T21:38:54Z INFO 9029 [NeffFileWriter]: IR signature: 57fba6ac28d818a4c0038ce126b61119 for neff artifacts +2025-11-04T21:38:54Z USER 9029 [BackendPassManager]: neff_packager finished after 0.162 seconds +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: curr_vmrss: 371mb, ru_maxrss: 467mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9029 [BackendPassManager]: Output has 2 module(s), 8 function(s), 7853 memory location(s), 8 block(s), and 31437 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005371 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005371 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.005859 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.006836 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.003922 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.001003 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.003269 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005859 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.009899 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.054916 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.000977 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000977 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.009899 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ all_gather.1 │ bfloat16 │ 1 │ 1.000000 MB │ +│ reshape.16 │ bfloat16 │ 1 │ 0.500000 MB │ +│ reshape.24 │ bfloat16 │ 1 │ 0.500000 MB │ +│ reshape.29 │ bfloat16 │ 1 │ 0.500000 MB │ +│ transpose.1 │ bfloat16 │ 1 │ 0.500000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=local (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.53 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.062500 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.062500 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg02, addr_space=local (complete data located at nc01/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.53 │ bfloat16 │ 1 │ 0.000001 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.062500 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.062500 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:54Z INFO 9029 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:54Z INFO 8522 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:54Z INFO 8522 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:54Z INFO 8522 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f +2025-11-04T21:38:54Z INFO 8522 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:54Z INFO 8522 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk1/neuronxcc-7_h2pq5f/hlo_netlist.json +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:54Z INFO 8522 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:54Z INFO 8522 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:54Z INFO 8502 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk1/metaneff.pb b/context_encoding_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..ca859bdff8c1acde34e47998748e89a41f5c81e0 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:456dc08330072407208f8e4a41b70cc9190b30d05dced01f768e2bbc43e5076d +size 2438380 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb b/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..842ec50cdcb8a6de982540beaceb779e448bcc2a --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4a7565239b86e91fc95d8ad2ceb0bdd0fa2489c90c536cf87cd40f007ac5d60 +size 2525166 diff --git a/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff b/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff new file mode 100644 index 0000000000000000000000000000000000000000..69e2489e54b9eb80c754175cdcb9058f306219bb --- /dev/null +++ b/context_encoding_model/_tp0_bk1/model.MODULE_2330bfb0632c950ddab1+62ecd68b.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b96dea22dba97fdfefb2f26f7ad03c509af0a395c08e4bfb143ff14bd673c826 +size 1229824 diff --git a/context_encoding_model/_tp0_bk1/neuron_config.json b/context_encoding_model/_tp0_bk1/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..040c07e8da9246c5bb0851092f91e8b8c7522a23 --- /dev/null +++ b/context_encoding_model/_tp0_bk1/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 256 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 256 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk2/command.txt b/context_encoding_model/_tp0_bk2/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4fa8325bd360e0e10c3ec05ff46b64669c99bd2 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --output model.MODULE_49bb42f69f5b159ae769+3467f95e.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json b/context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json new file mode 100644 index 0000000000000000000000000000000000000000..36118ef137cccae61a76cf9cf0ccf4b3a252c813 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/compile_flags.MODULE_49bb42f69f5b159ae769+3467f95e.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/global_metric_store.json b/context_encoding_model/_tp0_bk2/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..808208eaa9913ceef22e3ea5fe9f55dd9055a31d --- /dev/null +++ b/context_encoding_model/_tp0_bk2/global_metric_store.json @@ -0,0 +1,1177 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.77135467529297, + "StaticProfiler::AveragePartitionUtilization": 94.32398223876953, + "StaticProfiler::AveragePeUtilization": 96.75625610351563, + "StaticProfiler::LocalizationEfficiency": 86.58112335205078, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.07081985473632813, + "AffinePredicateResolution": 0.001847982406616211, + "AliasDependencyElimination": 0.0017039775848388672, + "AliasDependencyInduction": 0.016176223754882813, + "AliasDependencyReset": 0.0533907413482666, + "BFComputeCutting": 0.002690553665161133, + "BirCodeGenLoop": 0.436786413192749, + "CCOpFusion": 0.05509161949157715, + "CanonicalizeConv": 2.099999983329326e-05, + "CanonicalizeDAGForPGTiling": 0.01196432113647461, + "CanonicalizeForTensorizer": 4.0000002627493814e-05, + "CanonicalizeIR": 0.002866029739379883, + "Canonicalizer": 0.000770999991800636, + "CoalesceCCOp": 0.02091670036315918, + "CommuteConcat": 0.0016961097717285156, + "DMALocalityOpt": 0.012746095657348633, + "DMAProfiler": 0.025209903717041016, + "DMATilingProfiler": 0.013326406478881836, + "DataLocalityOpt": 0.13399314880371094, + "DataStreaming": 0.02252793312072754, + "DeConcat": 0.003023386001586914, + "DeadCodeElimination": 0.006216287612915039, + "DeadStoreElimination": 0.01400136947631836, + "DelinearIndices": 0.014129638671875, + "Delinearization": 0.004580259323120117, + "DelinearizeSPMD": 0.02204442024230957, + "DoNothing": 0.0005753040313720703, + "DramToDramTranspose": 0.0199737548828125, + "DumpGraphAndMetadata": 0.037271738052368164, + "EliminateDivs": 0.0025110244750976563, + "ExpandBatchNorm": 0.002251148223876953, + "ExpandISAMacro": 0.012173652648925781, + "FactorizeBlkDims": 0.041153669357299805, + "FactorizeThreadAxesInFreeDims": 0.0031156539916992188, + "FlattenMacroLoop": 0.005499601364135742, + "GenericAccessSimplifier": 0.004717350006103516, + "HoistCompute": 6.999999641266186e-06, + "IdentifyCrossPassTensors": 4.70000013592653e-05, + "InferInitValue": 0.046659231185913086, + "InferIntrinsicOnCC": 0.039793968200683594, + "InferNeuronTensor": 0.03774452209472656, + "InferNonlocalTensors": 0.030941486358642578, + "InferPSumTensor": 0.12924981117248535, + "InferShardAxis": 0.504509449005127, + "InferSharedMemLoc": 0.03389143943786621, + "InlineNativeKernels": 0.00193023681640625, + "InsertCoreBarrier": 0.019978046417236328, + "InsertIOTransposes": 0.061508893966674805, + "InsertImplicitShardAxisBeforeISel": 0.01612401008605957, + "InsertLocalTransposes": 0.005467414855957031, + "InsertOffloadedTransposes": 0.025030136108398438, + "LICM": 0.010097026824951172, + "LateLegalizeInst": 0.033937692642211914, + "LateLegalizePostSplit": 0.020189762115478516, + "LateLowerReshapeOp": 0.0018696784973144531, + "LateLowerTensorOp": 0.0022716522216796875, + "LateNeuronInstComb": 0.060944557189941406, + "LayoutPreprocessing": 0.05716848373413086, + "LayoutPreprocessingAndAnalysis": 0.12559008598327637, + "LayoutRequirementAnalysis": 0.01263284683227539, + "LegalizeCCOpLayout": 0.003709077835083008, + "LegalizeOpLevelAlias": 0.0016541481018066406, + "LegalizePartitionReduce": 0.007805347442626953, + "LegalizeSundaAccess": 0.13506388664245605, + "LegalizeSundaMacro": 0.020558595657348633, + "LegalizeType": 0.04366302490234375, + "LocalLayoutOpt": 0.04371356964111328, + "LoopFusion": 0.03305792808532715, + "LoopSplitting": 0.0017974376678466797, + "LowerBroadcast": 0.015467643737792969, + "LowerCCOpBlockAxis": 0.013673782348632813, + "LowerComplexBroadcast": 0.005238771438598633, + "LowerIntrinsics": 0.059927940368652344, + "LowerShardAxis": 0.02148151397705078, + "LowerTensorOp": 0.011847496032714844, + "LowerToSendRecv": 0.03099536895751953, + "LowerTranspose": 0.026517152786254883, + "MacroGeneration": 0.11886835098266602, + "MaskPropagation": 0.01356053352355957, + "MemcastMotion": 1.799999881768599e-05, + "MemcpyElimination": 0.050164222717285156, + "MutateDataType": 0.0028362274169921875, + "NeuronAliasDependencyInduction": 0.0024106502532958984, + "NeuronAliasDependencyReset": 0.07959818840026855, + "NeuronInstComb": 0.05623912811279297, + "NeuronLICM": 0.06090664863586426, + "NeuronLoopFusion": 0.0700373649597168, + "NeuronLoopInterchange": 0.003496885299682617, + "NeuronSimplifier": 0.0175168514251709, + "NeuronSimplifyPredicates": 0.035622596740722656, + "NeuronValueNumbering": 0.02324056625366211, + "OptimizeAliasedCopyChain": 0.0008881092071533203, + "OptimizeNKIKernels": 4.497897148132324, + "PAGLayoutOpt": 0.11170005798339844, + "PComputeCutting": 0.02699899673461914, + "PGLayoutTilingPipeline": 1.7730352878570557, + "PGTiling": 0.4928562641143799, + "PadElimination": 0.0005004405975341797, + "ParAxesAnnotation": 0.08141517639160156, + "PartialLoopFusion": 0.05184769630432129, + "PartialSimdFusion": 0.019034385681152344, + "PenguinizeFunctions": 3.7000001611886546e-05, + "PerfectLoopNest": 0.005218982696533203, + "PruneFunctions": 3.7999998312443495e-05, + "RecognizeOpIdiom": 0.028120994567871094, + "Recompute": 0.0006320476531982422, + "RelaxPredicates": 0.012555122375488281, + "Rematerialization": 0.002846240997314453, + "RemoveOptimizationBarriers": 8.199999865610152e-05, + "RemoveShardedPartitionAxes": 0.028553009033203125, + "ReshapeWeights": 0.0013833045959472656, + "ResolveAccessConflict": 0.007452726364135742, + "ResolveComplicatePredicates": 0.002027273178100586, + "RewriteReplicationMatmul": 0.0019905567169189453, + "RewriteWeights": 0.005997419357299805, + "SFKVectorizer": 0.2772505283355713, + "ScatterMotion": 2.300000051036477e-05, + "ShardingPropagationAnalysis": 0.11750531196594238, + "SimpleAllReduceTiling": 0.02184891700744629, + "Simplifier": 0.01620769500732422, + "SimplifyMacroPredicates": 0.03200030326843262, + "SimplifyNeuronTensor": 0.09968447685241699, + "SimplifySlice": 0.002093076705932617, + "SimplifyTensor": 0.01188349723815918, + "SpillPSum": 0.06837248802185059, + "SplitAPUnionSets": 0.09830927848815918, + "SplitAccGrp": 0.003184795379638672, + "StaticProfiler": 0.024499177932739258, + "StaticTransposeLocalTensor": 0.013921499252319336, + "SundaISel": 0.12911200523376465, + "TCTransform": 0.01076197624206543, + "TensorInitialization": 0.015585660934448242, + "TensorOpSimplifier": 0.009182214736938477, + "TensorOpTransform": 0.02479076385498047, + "TensorizerLegalizationPass": 4.5000000682193786e-05, + "TileCCOps": 0.01529073715209961, + "TilingProfiler": 0.02448558807373047, + "TransformConvOp": 0.0032668113708496094, + "TritiumFusion": 0.07947993278503418, + "ValueNumbering": 0.008611917495727539, + "VectorizeDMA": 0.008882284164428711, + "VectorizeMatMult": 0.013601303100585938, + "VerifySupportedOps": 3.199999991920777e-05, + "WeightCoalescing": 0.014402627944946289, + "ZeroSizeTensorElimination": 0.00017452239990234375, + "algsimp": 0.001744000008329749, + "batchnorm_expander": 3.5000000934815034e-05, + "boundary-marker-removal": 1.1000000085914508e-05, + "call-inliner": 0.00022499999613501132, + "canonicalize-boundary-marker": 1.2999999853491317e-05, + "collective-stream-id-checker": 6.0999998822808266e-05, + "comparison-expander": 0.0004409999819472432, + "computation-deduplicator": 5.299999611452222e-05, + "config-lowering": 9.800000407267362e-05, + "constant-statistics": 0.0003980000037699938, + "constant_folding": 0.00015499998698942363, + "cse": 3.199999991920777e-05, + "dce": 4.099999932805076e-05, + "dot_decomposer": 0.0008870000019669533, + "dynamic-slice-transpose": 1.2000000424450263e-05, + "eliminate-redundant-compare": 0.0001379999885102734, + "emit-offloaded-dropout": 3.400000059627928e-05, + "flatten-call-graph": 0.0006670000148005784, + "fuse-send-recv": 5.299999611452222e-05, + "hilo-conditional-to-select": 1.2000000424450263e-05, + "hilo::LegalizeAlias": 1.1999999514955562e-05, + "hilo::NeuronInstCombine": 0.00015300000086426735, + "hilo::NeuronOpFusion": 2.9999999242136255e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 3.099999958067201e-05, + "hilo::ScheduleFusion": 5.999999757477781e-06, + "hilo::SixtyFourHack": 6.500000017695129e-05, + "hilo::VerifyAliasing": 4.999999873689376e-06, + "hlo-mac-count": 0.01228100061416626, + "instruction-histogram": 0.0007319999858736992, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0008159999852068722, + "io-statistics": 3.899999865097925e-05, + "legalize-ccops-for-tensorizer": 3.000000106112566e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.000000318337698e-06, + "map-inline": 0.0007249999907799065, + "metadata-naming": 4.400000034365803e-05, + "mlir::detail::OpToOpPassAdaptor": 6.800000119255856e-05, + "mlir::hlo::MhloToPyPenguin": 0.008609999902546406, + "mlir::mhlo::LowerComplexExtraPass": 0.000291000003926456, + "mlir::mhlo::LowerComplexPass": 0.0005230000242590904, + "native-to-custom-softmax": 0.0003209999995306134, + "native-to-custom-softmax-dx": 0.0004980000085197389, + "neuron-hlo-verifier": 0.010431000031530857, + "operand_upcaster": 4.400000034365803e-05, + "opt-barrier-removal": 0.0002589999930933118, + "post-par-pipe-begin": 5.999999757477781e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0013230000622570515, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.06850799918174744, + "replace-minimum-constant": 0.00036299999919719994, + "reshape-mover": 5.500000042957254e-05, + "simplify-concat": 0.00010000000474974513, + "simplify-while-loops": 5.0000002374872565e-05, + "transform-variadic-reduce": 5.8999998145736754e-05, + "tuple-simplifier": 0.00014600000577047467, + "unpack-nested-aws-ntwsr": 0.0002479999966453761, + "unroll-while-loop": 7.999999979801942e-06, + "zero_sized_hlo_elimination": 0.0007040000054985285 + }, + "hilo": { + "ConstantSize": 926335.0, + "HloInputCount": 371.0, + "HloMacCount": 26463305728.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910916096.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 886427776.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0, + "StaticProfiler::AifUb": 173.52798461914063, + "StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922, + "StaticProfiler::AverageDmaLength": 2589.193359375, + "StaticProfiler::DDRTransferBytes": 407886880.0, + "StaticProfiler::InternalTransferBytes": 327079712.0, + "StaticProfiler::LoadExpanded": 89436.0, + "StaticProfiler::StoreExpanded": 2154.0, + "StaticProfiler::TotalDMAExpanded": 91590.0, + "StaticProfiler::TotalDynamicInstancesCount": 26447.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 11424.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10291.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 164.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0016029999824240804, + "call-inliner": 0.00019999999494757503, + "collective-stream-id-checker": 5.2999999752501026e-05, + "comparison-expander": 0.00042699999175965786, + "constant-statistics": 0.0003980000037699938, + "constant_folding": 0.0001340000017080456, + "dce": 3.7999998312443495e-05, + "dot_decomposer": 0.0008870000019669533, + "eliminate-redundant-compare": 0.0001289999927394092, + "flatten-call-graph": 0.0006440000142902136, + "hlo-mac-count": 0.007197000086307526, + "instruction-histogram": 0.0007319999858736992, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0008159999852068722, + "io-statistics": 3.899999865097925e-05, + "map-inline": 0.0006960000027902424, + "native-to-custom-softmax": 0.00030499999411404133, + "native-to-custom-softmax-dx": 0.00039000000106170774, + "neuron-hlo-verifier": 0.009362000040709972, + "opt-barrier-removal": 0.0002589999930933118, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.06850799918174744, + "replace-minimum-constant": 0.00034500000765547156, + "reshape-mover": 4.8999998398358e-05, + "simplify-while-loops": 4.400000034365803e-05, + "tuple-simplifier": 0.0001340000017080456, + "unpack-nested-aws-ntwsr": 0.00023799999326001853, + "unroll-while-loop": 7.999999979801942e-06, + "zero_sized_hlo_elimination": 0.0007040000054985285 + } + }, + "attention_isa_kernel": { + "compiletime": { + "CoalesceCCOp": 0.00021982192993164063, + "DMALocalityOpt": 0.00021767616271972656, + "DMAProfiler": 0.0002532005310058594, + "DataStreaming": 0.00019359588623046875, + "DoNothing": 0.00017213821411132813, + "ExpandISAMacro": 0.00021219253540039063, + "FactorizeBlkDims": 0.0016205310821533203, + "InferPSumTensor": 0.00067901611328125, + "InferSharedMemLoc": 0.0005524158477783203, + "InsertCoreBarrier": 0.00033855438232421875, + "LateLegalizeInst": 0.00021457672119140625, + "LateNeuronInstComb": 0.00042700767517089844, + "LegalizeSundaAccess": 0.00022602081298828125, + "LegalizeType": 0.00026869773864746094, + "LowerBroadcast": 0.0002257823944091797, + "LowerIntrinsics": 0.0002770423889160156, + "LowerTranspose": 0.0002372264862060547, + "NeuronInstComb": 0.0004298686981201172, + "NeuronLICM": 0.00019097328186035156, + "NeuronSimplifyPredicates": 0.00029349327087402344, + "NeuronValueNumbering": 0.00023818016052246094, + "SFKVectorizer": 0.0022597312927246094, + "SimpleAllReduceTiling": 0.00019431114196777344, + "SimplifyNeuronTensor": 0.0004868507385253906, + "SpillPSum": 0.0006351470947265625, + "WeightCoalescing": 0.00022172927856445313 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0003490447998046875, + "DMALocalityOpt": 0.00027871131896972656, + "DMAProfiler": 0.0013451576232910156, + "DataStreaming": 0.00047016143798828125, + "DoNothing": 0.0002353191375732422, + "ExpandISAMacro": 0.0008096694946289063, + "FactorizeBlkDims": 0.0007121562957763672, + "InferPSumTensor": 0.0026960372924804688, + "InferSharedMemLoc": 0.0007166862487792969, + "InsertCoreBarrier": 0.0004069805145263672, + "LateLegalizeInst": 0.0005886554718017578, + "LateNeuronInstComb": 0.002978801727294922, + "LegalizeSundaAccess": 0.003289461135864258, + "LegalizeType": 0.00041961669921875, + "LowerBroadcast": 0.0004119873046875, + "LowerIntrinsics": 0.0003657341003417969, + "LowerTranspose": 0.0004086494445800781, + "NeuronInstComb": 0.0012252330780029297, + "NeuronLICM": 0.0016541481018066406, + "NeuronSimplifyPredicates": 0.003880739212036133, + "NeuronValueNumbering": 0.0015976428985595703, + "SFKVectorizer": 0.005974292755126953, + "SimpleAllReduceTiling": 0.0007178783416748047, + "SimplifyNeuronTensor": 0.001119852066040039, + "SpillPSum": 0.003050565719604492, + "WeightCoalescing": 0.004181385040283203 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 1.1000000085914508e-05, + "CanonicalizeForTensorizer": 1.4000000192027073e-05, + "Canonicalizer": 0.00028899998869746923, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.5999999959603883e-05, + "MemcastMotion": 9.999999747378752e-06, + "PenguinizeFunctions": 1.4000000192027073e-05, + "PruneFunctions": 1.4000000192027073e-05, + "RemoveOptimizationBarriers": 2.099999983329326e-05, + "ScatterMotion": 9.000000318337698e-06, + "TensorizerLegalizationPass": 2.2000000171829015e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.000000096013537e-06, + "canonicalize-boundary-marker": 3.999999989900971e-06, + "collective-stream-id-checker": 1.9999999949504854e-06, + "comparison-expander": 3.999999989900971e-06, + "computation-deduplicator": 1.4999999621068127e-05, + "config-lowering": 3.400000059627928e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 9.999999747378752e-06, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2000000424450263e-05, + "flatten-call-graph": 7.000000096013537e-06, + "fuse-send-recv": 1.700000029813964e-05, + "hilo-conditional-to-select": 3.000000106112566e-06, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 5.700000110664405e-05, + "hilo::NeuronOpFusion": 1.4000000192027073e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2000000424450263e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.2000000424450263e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 8.499999967170879e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.000000318337698e-06, + "metadata-naming": 1.2000000424450263e-05, + "mlir::detail::OpToOpPassAdaptor": 2.300000051036477e-05, + "mlir::hlo::MhloToPyPenguin": 0.0016840000171214342, + "mlir::mhlo::LowerComplexExtraPass": 7.699999696342275e-05, + "mlir::mhlo::LowerComplexPass": 0.0001720000000204891, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 7.200000254670158e-05, + "neuron-hlo-verifier": 0.000371000001905486, + "operand_upcaster": 1.4000000192027073e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00043399998685345054, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 1.9999999949504854e-06, + "simplify-concat": 3.300000025774352e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 17.4229793548584, + "ConstantSize": 926335.0, + "HloInputCount": 371.0, + "HloMacCount": 3489660928.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910916096.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 400581408.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.07508444786071777, + "AffinePredicateResolution": 0.0010340213775634766, + "AliasDependencyElimination": 0.0002384185791015625, + "AliasDependencyInduction": 0.007371425628662109, + "AliasDependencyReset": 0.0582888126373291, + "BFComputeCutting": 0.013819217681884766, + "BirCodeGenLoop": 0.06449317932128906, + "CCOpFusion": 0.04928326606750488, + "CanonicalizeDAGForPGTiling": 0.0076160430908203125, + "CanonicalizeIR": 0.0027213096618652344, + "CoalesceCCOp": 0.007978439331054688, + "CommuteConcat": 0.002101421356201172, + "DMALocalityOpt": 0.005911350250244141, + "DMAProfiler": 0.011723995208740234, + "DMATilingProfiler": 0.0077321529388427734, + "DataLocalityOpt": 0.20074963569641113, + "DataStreaming": 0.012155294418334961, + "DeConcat": 0.00474858283996582, + "DeadCodeElimination": 0.002126932144165039, + "DeadStoreElimination": 0.044701576232910156, + "DelinearIndices": 0.019860267639160156, + "Delinearization": 0.006117343902587891, + "DelinearizeSPMD": 0.04185628890991211, + "DoNothing": 9.918212890625e-05, + "DramToDramTranspose": 0.017105817794799805, + "DumpGraphAndMetadata": 0.0168914794921875, + "EliminateDivs": 0.0026845932006835938, + "ExpandBatchNorm": 0.0020225048065185547, + "ExpandISAMacro": 0.007347822189331055, + "FactorizeBlkDims": 0.05445575714111328, + "FactorizeThreadAxesInFreeDims": 0.004782199859619141, + "FlattenMacroLoop": 0.012040138244628906, + "GenericAccessSimplifier": 0.001428365707397461, + "InferInitValue": 0.08275437355041504, + "InferIntrinsicOnCC": 0.016964197158813477, + "InferNeuronTensor": 0.0713052749633789, + "InferNonlocalTensors": 0.17369747161865234, + "InferPSumTensor": 0.07679295539855957, + "InferShardAxis": 0.5430936813354492, + "InferSharedMemLoc": 0.0051038265228271484, + "InlineNativeKernels": 0.005239963531494141, + "InsertCoreBarrier": 0.008324384689331055, + "InsertIOTransposes": 0.038658857345581055, + "InsertImplicitShardAxisBeforeISel": 0.009135007858276367, + "InsertLocalTransposes": 0.029627084732055664, + "InsertOffloadedTransposes": 0.019885540008544922, + "LICM": 0.0056383609771728516, + "LateLegalizeInst": 0.011803150177001953, + "LateLegalizePostSplit": 0.005868196487426758, + "LateLowerReshapeOp": 0.007382631301879883, + "LateLowerTensorOp": 0.004155397415161133, + "LateNeuronInstComb": 0.0334017276763916, + "LayoutPreprocessing": 0.25243687629699707, + "LayoutPreprocessingAndAnalysis": 0.30139756202697754, + "LayoutRequirementAnalysis": 0.014056921005249023, + "LegalizeCCOpLayout": 0.0020928382873535156, + "LegalizeOpLevelAlias": 0.0016238689422607422, + "LegalizePartitionReduce": 0.0030252933502197266, + "LegalizeSundaAccess": 0.05711483955383301, + "LegalizeSundaMacro": 0.023845911026000977, + "LegalizeType": 0.00843501091003418, + "LocalLayoutOpt": 0.11445784568786621, + "LoopFusion": 0.01024007797241211, + "LoopSplitting": 0.0017781257629394531, + "LowerBroadcast": 0.0037119388580322266, + "LowerCCOpBlockAxis": 0.014172077178955078, + "LowerComplexBroadcast": 0.004027366638183594, + "LowerIntrinsics": 0.03793048858642578, + "LowerShardAxis": 0.012651443481445313, + "LowerTensorOp": 0.01001119613647461, + "LowerToSendRecv": 0.005930900573730469, + "LowerTranspose": 0.018492937088012695, + "MacroGeneration": 0.11934685707092285, + "MaskPropagation": 0.005895137786865234, + "MemcpyElimination": 0.09257030487060547, + "MutateDataType": 0.0017631053924560547, + "NeuronAliasDependencyInduction": 0.0007777214050292969, + "NeuronAliasDependencyReset": 0.03222823143005371, + "NeuronInstComb": 0.02764892578125, + "NeuronLICM": 0.015506982803344727, + "NeuronLoopFusion": 0.0383763313293457, + "NeuronLoopInterchange": 0.010429620742797852, + "NeuronSimplifier": 0.033356666564941406, + "NeuronSimplifyPredicates": 0.006680965423583984, + "NeuronValueNumbering": 0.019241809844970703, + "OptimizeAliasedCopyChain": 0.0010235309600830078, + "OptimizeNKIKernels": 0.45916128158569336, + "PAGLayoutOpt": 0.7117609977722168, + "PComputeCutting": 0.020105838775634766, + "PGLayoutTilingPipeline": 2.928948163986206, + "PGTiling": 0.39027953147888184, + "PadElimination": 0.0007317066192626953, + "ParAxesAnnotation": 0.6492185592651367, + "PartialLoopFusion": 0.0445561408996582, + "PartialSimdFusion": 0.039563655853271484, + "PerfectLoopNest": 0.0034646987915039063, + "RecognizeOpIdiom": 0.016507387161254883, + "Recompute": 0.0003933906555175781, + "RelaxPredicates": 0.005345582962036133, + "Rematerialization": 0.005880117416381836, + "RemoveShardedPartitionAxes": 0.03753328323364258, + "ReshapeWeights": 0.002991914749145508, + "ResolveAccessConflict": 0.0245821475982666, + "ResolveComplicatePredicates": 0.0018818378448486328, + "RewriteReplicationMatmul": 0.0024051666259765625, + "RewriteWeights": 0.006072998046875, + "SFKVectorizer": 0.49936652183532715, + "ShardingPropagationAnalysis": 0.03256559371948242, + "SimpleAllReduceTiling": 0.0036296844482421875, + "Simplifier": 0.007125377655029297, + "SimplifyMacroPredicates": 0.02839207649230957, + "SimplifyNeuronTensor": 0.021625995635986328, + "SimplifySlice": 0.0024862289428710938, + "SimplifyTensor": 0.033231496810913086, + "SpillPSum": 0.034162282943725586, + "SplitAPUnionSets": 0.042994022369384766, + "SplitAccGrp": 0.00764918327331543, + "StaticProfiler": 0.008186817169189453, + "StaticTransposeLocalTensor": 0.007767438888549805, + "SundaISel": 0.05960273742675781, + "TCTransform": 0.00103759765625, + "TensorInitialization": 0.007684469223022461, + "TensorOpSimplifier": 0.006952047348022461, + "TensorOpTransform": 0.030390501022338867, + "TileCCOps": 0.006802797317504883, + "TilingProfiler": 0.040956735610961914, + "TransformConvOp": 0.0029840469360351563, + "TritiumFusion": 0.03676962852478027, + "ValueNumbering": 0.0034532546997070313, + "VectorizeDMA": 0.005709171295166016, + "VectorizeMatMult": 0.030527591705322266, + "WeightCoalescing": 0.0040700435638427734, + "ZeroSizeTensorElimination": 0.0002455711364746094 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 1174.0, + "StaticProfiler::AifUb": 16.874553680419922, + "StaticProfiler::ArithmeticIntensityTensorizer": 204.6156768798828, + "StaticProfiler::AverageDmaLength": 1413.5869140625, + "StaticProfiler::AverageFractalPeUtilization": 99.77033233642578, + "StaticProfiler::AveragePartitionUtilization": 99.01372528076172, + "StaticProfiler::AveragePeUtilization": 99.29181671142578, + "StaticProfiler::DDRTransferBytes": 38148616.0, + "StaticProfiler::InternalTransferBytes": 22941696.0, + "StaticProfiler::LoadExpanded": 12553.0, + "StaticProfiler::LocalizationEfficiency": 1212.5694580078125, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1809.3712158203125, + "StaticProfiler::StoreExpanded": 8193.0, + "StaticProfiler::TotalDMAExpanded": 20746.0, + "StaticProfiler::TotalDynamicInstancesCount": 1510.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 1506.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 40.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 644.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 2.0, + "TilingProfiler::NumPfTransposesForLocal": 2.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 209.0, + "TilingProfiler::PfTransposeInstructionsForIo": 65.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 48.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 96.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 136.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.08034706115722656, + "AffinePredicateResolution": 0.0021657943725585938, + "AliasDependencyElimination": 0.0002224445343017578, + "AliasDependencyInduction": 0.006604909896850586, + "AliasDependencyReset": 0.028621673583984375, + "BFComputeCutting": 0.006361484527587891, + "BirCodeGenLoop": 0.043970584869384766, + "CCOpFusion": 0.03917193412780762, + "CanonicalizeDAGForPGTiling": 0.015412569046020508, + "CanonicalizeIR": 0.0026285648345947266, + "CoalesceCCOp": 0.019171714782714844, + "CommuteConcat": 0.0022630691528320313, + "DMALocalityOpt": 0.0018835067749023438, + "DMAProfiler": 0.015621662139892578, + "DMATilingProfiler": 0.007387399673461914, + "DataLocalityOpt": 0.3166489601135254, + "DataStreaming": 0.008202552795410156, + "DeConcat": 0.0027625560760498047, + "DeadCodeElimination": 0.008514642715454102, + "DeadStoreElimination": 0.02995467185974121, + "DelinearIndices": 0.020328283309936523, + "Delinearization": 0.008889198303222656, + "DelinearizeSPMD": 0.025659799575805664, + "DoNothing": 9.298324584960938e-05, + "DramToDramTranspose": 0.013378381729125977, + "DumpGraphAndMetadata": 0.011143684387207031, + "EliminateDivs": 0.006491422653198242, + "ExpandBatchNorm": 0.0015842914581298828, + "ExpandISAMacro": 0.014866113662719727, + "FactorizeBlkDims": 0.02399158477783203, + "FactorizeThreadAxesInFreeDims": 0.008170843124389648, + "FlattenMacroLoop": 0.013584375381469727, + "GenericAccessSimplifier": 0.0016484260559082031, + "InferInitValue": 0.09902763366699219, + "InferIntrinsicOnCC": 0.05336475372314453, + "InferNeuronTensor": 0.0689244270324707, + "InferNonlocalTensors": 0.0623164176940918, + "InferPSumTensor": 0.06397223472595215, + "InferShardAxis": 0.7081491947174072, + "InferSharedMemLoc": 0.008078813552856445, + "InlineNativeKernels": 0.002736806869506836, + "InsertCoreBarrier": 0.008532524108886719, + "InsertIOTransposes": 0.04539895057678223, + "InsertImplicitShardAxisBeforeISel": 0.011088132858276367, + "InsertLocalTransposes": 0.008382081985473633, + "InsertOffloadedTransposes": 0.009244203567504883, + "LICM": 0.0059854984283447266, + "LateLegalizeInst": 0.012192487716674805, + "LateLegalizePostSplit": 0.004922151565551758, + "LateLowerReshapeOp": 0.0050048828125, + "LateLowerTensorOp": 0.00384521484375, + "LateNeuronInstComb": 0.01603221893310547, + "LayoutPreprocessing": 0.083892822265625, + "LayoutPreprocessingAndAnalysis": 0.14038705825805664, + "LayoutRequirementAnalysis": 0.026170969009399414, + "LegalizeCCOpLayout": 0.0018677711486816406, + "LegalizeOpLevelAlias": 0.0019845962524414063, + "LegalizePartitionReduce": 0.002770662307739258, + "LegalizeSundaAccess": 0.02824854850769043, + "LegalizeSundaMacro": 0.025277376174926758, + "LegalizeType": 0.005255222320556641, + "LocalLayoutOpt": 0.1487877368927002, + "LoopFusion": 0.009909629821777344, + "LoopSplitting": 0.004529237747192383, + "LowerBroadcast": 0.0027620792388916016, + "LowerCCOpBlockAxis": 0.012650728225708008, + "LowerComplexBroadcast": 0.015005111694335938, + "LowerIntrinsics": 0.03992509841918945, + "LowerShardAxis": 0.01078486442565918, + "LowerTensorOp": 0.010359048843383789, + "LowerToSendRecv": 0.010585546493530273, + "LowerTranspose": 0.024251461029052734, + "MacroGeneration": 0.17415404319763184, + "MaskPropagation": 0.009861946105957031, + "MemcpyElimination": 0.08973836898803711, + "MutateDataType": 0.0023250579833984375, + "NeuronAliasDependencyInduction": 0.0036211013793945313, + "NeuronAliasDependencyReset": 0.03322243690490723, + "NeuronInstComb": 0.027010679244995117, + "NeuronLICM": 0.014135122299194336, + "NeuronLoopFusion": 0.0790092945098877, + "NeuronLoopInterchange": 0.006104946136474609, + "NeuronSimplifier": 0.02999567985534668, + "NeuronSimplifyPredicates": 0.0038328170776367188, + "NeuronValueNumbering": 0.016868114471435547, + "OptimizeAliasedCopyChain": 0.0012192726135253906, + "OptimizeNKIKernels": 0.4351818561553955, + "PAGLayoutOpt": 0.3483104705810547, + "PComputeCutting": 0.02324676513671875, + "PGLayoutTilingPipeline": 2.0860910415649414, + "PGTiling": 0.4031491279602051, + "PadElimination": 0.000728607177734375, + "ParAxesAnnotation": 0.30509090423583984, + "PartialLoopFusion": 0.06583142280578613, + "PartialSimdFusion": 0.1207880973815918, + "PerfectLoopNest": 0.010277032852172852, + "RecognizeOpIdiom": 0.004372358322143555, + "Recompute": 0.00031304359436035156, + "RelaxPredicates": 0.005488395690917969, + "Rematerialization": 0.0020155906677246094, + "RemoveShardedPartitionAxes": 0.026065587997436523, + "ReshapeWeights": 0.0033690929412841797, + "ResolveAccessConflict": 0.011795282363891602, + "ResolveComplicatePredicates": 0.005822658538818359, + "RewriteReplicationMatmul": 0.004129886627197266, + "RewriteWeights": 0.012514114379882813, + "SFKVectorizer": 0.3114356994628906, + "ShardingPropagationAnalysis": 0.03329586982727051, + "SimpleAllReduceTiling": 0.003468751907348633, + "Simplifier": 0.007978200912475586, + "SimplifyMacroPredicates": 0.01414942741394043, + "SimplifyNeuronTensor": 0.018707275390625, + "SimplifySlice": 0.0030634403228759766, + "SimplifyTensor": 0.028036117553710938, + "SpillPSum": 0.02836132049560547, + "SplitAPUnionSets": 0.028769254684448242, + "SplitAccGrp": 0.002518892288208008, + "StaticProfiler": 0.012613058090209961, + "StaticTransposeLocalTensor": 0.014979124069213867, + "SundaISel": 0.06619906425476074, + "TCTransform": 0.0018546581268310547, + "TensorInitialization": 0.0047528743743896484, + "TensorOpSimplifier": 0.006958484649658203, + "TensorOpTransform": 0.0394289493560791, + "TileCCOps": 0.03006148338317871, + "TilingProfiler": 0.020921945571899414, + "TransformConvOp": 0.0030717849731445313, + "TritiumFusion": 0.10711383819580078, + "ValueNumbering": 0.002644777297973633, + "VectorizeDMA": 0.009524345397949219, + "VectorizeMatMult": 0.04689669609069824, + "WeightCoalescing": 0.004178285598754883, + "ZeroSizeTensorElimination": 0.00014138221740722656 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 3307.0, + "StaticProfiler::AifUb": 142.25091552734375, + "StaticProfiler::ArithmeticIntensityTensorizer": 232.9062957763672, + "StaticProfiler::AverageDmaLength": 3958.823974609375, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.65841674804688, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 118065160.0, + "StaticProfiler::InternalTransferBytes": 19660800.0, + "StaticProfiler::LoadExpanded": 17025.0, + "StaticProfiler::LocalizationEfficiency": 163.7292022705078, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 185.10040283203125, + "StaticProfiler::StoreExpanded": 7937.0, + "StaticProfiler::TotalDMAExpanded": 24962.0, + "StaticProfiler::TotalDynamicInstancesCount": 3517.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 3517.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 32.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 2560.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 2.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 232.0, + "TilingProfiler::PfTransposeInstructionsForIo": 72.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 32.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 128.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 211.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.07081985473632813, + "AffinePredicateResolution": 0.001847982406616211, + "AliasDependencyElimination": 0.0017039775848388672, + "AliasDependencyInduction": 0.016176223754882813, + "AliasDependencyReset": 0.0533907413482666, + "BFComputeCutting": 0.002690553665161133, + "BirCodeGenLoop": 0.436786413192749, + "CCOpFusion": 0.05509161949157715, + "CanonicalizeDAGForPGTiling": 0.01196432113647461, + "CanonicalizeIR": 0.002866029739379883, + "CoalesceCCOp": 0.00784611701965332, + "CommuteConcat": 0.0016961097717285156, + "DMALocalityOpt": 0.006368398666381836, + "DMAProfiler": 0.016033411026000977, + "DMATilingProfiler": 0.013326406478881836, + "DataLocalityOpt": 0.13399314880371094, + "DataStreaming": 0.005326271057128906, + "DeConcat": 0.003023386001586914, + "DeadCodeElimination": 0.006216287612915039, + "DeadStoreElimination": 0.01400136947631836, + "DelinearIndices": 0.014129638671875, + "Delinearization": 0.004580259323120117, + "DelinearizeSPMD": 0.02204442024230957, + "DoNothing": 6.771087646484375e-05, + "DramToDramTranspose": 0.0199737548828125, + "DumpGraphAndMetadata": 0.037271738052368164, + "EliminateDivs": 0.0025110244750976563, + "ExpandBatchNorm": 0.002251148223876953, + "ExpandISAMacro": 0.0057184696197509766, + "FactorizeBlkDims": 0.020665884017944336, + "FactorizeThreadAxesInFreeDims": 0.0031156539916992188, + "FlattenMacroLoop": 0.005499601364135742, + "GenericAccessSimplifier": 0.004717350006103516, + "InferInitValue": 0.046659231185913086, + "InferIntrinsicOnCC": 0.039793968200683594, + "InferNeuronTensor": 0.03774452209472656, + "InferNonlocalTensors": 0.030941486358642578, + "InferPSumTensor": 0.10350608825683594, + "InferShardAxis": 0.504509449005127, + "InferSharedMemLoc": 0.021315813064575195, + "InlineNativeKernels": 0.00193023681640625, + "InsertCoreBarrier": 0.008482217788696289, + "InsertIOTransposes": 0.061508893966674805, + "InsertImplicitShardAxisBeforeISel": 0.01612401008605957, + "InsertLocalTransposes": 0.005467414855957031, + "InsertOffloadedTransposes": 0.025030136108398438, + "LICM": 0.010097026824951172, + "LateLegalizeInst": 0.010406017303466797, + "LateLegalizePostSplit": 0.020189762115478516, + "LateLowerReshapeOp": 0.0018696784973144531, + "LateLowerTensorOp": 0.0022716522216796875, + "LateNeuronInstComb": 0.022235631942749023, + "LayoutPreprocessing": 0.05716848373413086, + "LayoutPreprocessingAndAnalysis": 0.12559008598327637, + "LayoutRequirementAnalysis": 0.01263284683227539, + "LegalizeCCOpLayout": 0.003709077835083008, + "LegalizeOpLevelAlias": 0.0016541481018066406, + "LegalizePartitionReduce": 0.007805347442626953, + "LegalizeSundaAccess": 0.09120893478393555, + "LegalizeSundaMacro": 0.020558595657348633, + "LegalizeType": 0.006526947021484375, + "LocalLayoutOpt": 0.04371356964111328, + "LoopFusion": 0.03305792808532715, + "LoopSplitting": 0.0017974376678466797, + "LowerBroadcast": 0.005987882614135742, + "LowerCCOpBlockAxis": 0.013673782348632813, + "LowerComplexBroadcast": 0.005238771438598633, + "LowerIntrinsics": 0.04390692710876465, + "LowerShardAxis": 0.02148151397705078, + "LowerTensorOp": 0.011847496032714844, + "LowerToSendRecv": 0.03099536895751953, + "LowerTranspose": 0.022028207778930664, + "MacroGeneration": 0.11886835098266602, + "MaskPropagation": 0.01356053352355957, + "MemcpyElimination": 0.050164222717285156, + "MutateDataType": 0.0028362274169921875, + "NeuronAliasDependencyInduction": 0.0024106502532958984, + "NeuronAliasDependencyReset": 0.07959818840026855, + "NeuronInstComb": 0.024571895599365234, + "NeuronLICM": 0.019634723663330078, + "NeuronLoopFusion": 0.0700373649597168, + "NeuronLoopInterchange": 0.003496885299682617, + "NeuronSimplifier": 0.0175168514251709, + "NeuronSimplifyPredicates": 0.01945638656616211, + "NeuronValueNumbering": 0.014354467391967773, + "OptimizeAliasedCopyChain": 0.0008881092071533203, + "OptimizeNKIKernels": 4.497897148132324, + "PAGLayoutOpt": 0.11170005798339844, + "PComputeCutting": 0.02699899673461914, + "PGLayoutTilingPipeline": 1.7730352878570557, + "PGTiling": 0.4928562641143799, + "PadElimination": 0.0005004405975341797, + "ParAxesAnnotation": 0.08141517639160156, + "PartialLoopFusion": 0.05184769630432129, + "PartialSimdFusion": 0.019034385681152344, + "PerfectLoopNest": 0.005218982696533203, + "RecognizeOpIdiom": 0.028120994567871094, + "Recompute": 0.0006320476531982422, + "RelaxPredicates": 0.012555122375488281, + "Rematerialization": 0.002846240997314453, + "RemoveShardedPartitionAxes": 0.028553009033203125, + "ReshapeWeights": 0.0013833045959472656, + "ResolveAccessConflict": 0.007452726364135742, + "ResolveComplicatePredicates": 0.002027273178100586, + "RewriteReplicationMatmul": 0.0019905567169189453, + "RewriteWeights": 0.005997419357299805, + "SFKVectorizer": 0.20844674110412598, + "ShardingPropagationAnalysis": 0.11750531196594238, + "SimpleAllReduceTiling": 0.0042400360107421875, + "Simplifier": 0.01620769500732422, + "SimplifyMacroPredicates": 0.03200030326843262, + "SimplifyNeuronTensor": 0.016496896743774414, + "SimplifySlice": 0.002093076705932617, + "SimplifyTensor": 0.01188349723815918, + "SpillPSum": 0.019929170608520508, + "SplitAPUnionSets": 0.09830927848815918, + "SplitAccGrp": 0.003184795379638672, + "StaticProfiler": 0.024499177932739258, + "StaticTransposeLocalTensor": 0.013921499252319336, + "SundaISel": 0.12911200523376465, + "TCTransform": 0.01076197624206543, + "TensorInitialization": 0.015585660934448242, + "TensorOpSimplifier": 0.009182214736938477, + "TensorOpTransform": 0.02479076385498047, + "TileCCOps": 0.01529073715209961, + "TilingProfiler": 0.02448558807373047, + "TransformConvOp": 0.0032668113708496094, + "TritiumFusion": 0.07947993278503418, + "ValueNumbering": 0.008611917495727539, + "VectorizeDMA": 0.008882284164428711, + "VectorizeMatMult": 0.013601303100585938, + "WeightCoalescing": 0.0029730796813964844, + "ZeroSizeTensorElimination": 0.00017452239990234375 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 22051.0, + "StaticProfiler::AifUb": 173.52798461914063, + "StaticProfiler::ArithmeticIntensityTensorizer": 150.2424774169922, + "StaticProfiler::AverageDmaLength": 2589.193359375, + "StaticProfiler::AverageFractalPeUtilization": 98.77135467529297, + "StaticProfiler::AveragePartitionUtilization": 94.32398223876953, + "StaticProfiler::AveragePeUtilization": 96.75625610351563, + "StaticProfiler::DDRTransferBytes": 407886880.0, + "StaticProfiler::InternalTransferBytes": 327079712.0, + "StaticProfiler::LoadExpanded": 89436.0, + "StaticProfiler::LocalizationEfficiency": 86.58112335205078, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.48306274414063, + "StaticProfiler::StoreExpanded": 2154.0, + "StaticProfiler::TotalDMAExpanded": 91590.0, + "StaticProfiler::TotalDynamicInstancesCount": 26447.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 25996.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 11424.0, + "TilingProfiler::NumPfTransposes": 6.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 10291.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 786.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 4.0, + "TilingProfiler::SimdInstructionsAfterTiling": 164.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeForTensorizer": 1.4000000192027073e-05, + "Canonicalizer": 0.00020799999765586108, + "HoistCompute": 4.999999873689376e-06, + "IdentifyCrossPassTensors": 1.5999999959603883e-05, + "MemcastMotion": 7.000000096013537e-06, + "PenguinizeFunctions": 1.2000000424450263e-05, + "PruneFunctions": 1.5999999959603883e-05, + "RemoveOptimizationBarriers": 2.2000000171829015e-05, + "ScatterMotion": 1.1000000085914508e-05, + "TensorizerLegalizationPass": 1.700000029813964e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 4.5000000682193786e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8999999156221747e-05, + "config-lowering": 2.8000000384054147e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 9.999999747378752e-06, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.1000000085914508e-05, + "flatten-call-graph": 6.000000212225132e-06, + "fuse-send-recv": 1.8999999156221747e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 5.199999941396527e-05, + "hilo::NeuronOpFusion": 1.1000000085914508e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 6.000000212225132e-06, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.2000000424450263e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 9.699999645818025e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.8000000636675395e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.001829999964684248, + "mlir::mhlo::LowerComplexExtraPass": 0.00011999999696854502, + "mlir::mhlo::LowerComplexPass": 0.0001849999971454963, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.2000000424450263e-05, + "neuron-hlo-verifier": 0.00036700000055134296, + "operand_upcaster": 1.4000000192027073e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0004250000056345016, + "replace-minimum-constant": 4.999999873689376e-06, + "reshape-mover": 1.9999999949504854e-06, + "simplify-concat": 3.5000000934815034e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 201.87655639648438, + "HloMacCount": 13153337344.0, + "Traffic": 130310688.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 9.000000318337698e-06, + "CanonicalizeForTensorizer": 1.2000000424450263e-05, + "Canonicalizer": 0.0002739999908953905, + "HoistCompute": 0.0, + "IdentifyCrossPassTensors": 1.4999999621068127e-05, + "MemcastMotion": 9.999999974752427e-07, + "PenguinizeFunctions": 1.1000000085914508e-05, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 3.899999865097925e-05, + "ScatterMotion": 3.000000106112566e-06, + "TensorizerLegalizationPass": 6.000000212225132e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.000000106112566e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 3.999999989900971e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8999999156221747e-05, + "config-lowering": 3.600000127335079e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.1000000085914508e-05, + "flatten-call-graph": 9.999999747378752e-06, + "fuse-send-recv": 1.700000029813964e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 4.400000034365803e-05, + "hilo::NeuronOpFusion": 4.999999873689376e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 4.099999932805076e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.004902000073343515, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.4000000192027073e-05, + "mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05, + "mlir::hlo::MhloToPyPenguin": 0.005096000153571367, + "mlir::mhlo::LowerComplexExtraPass": 9.40000027185306e-05, + "mlir::mhlo::LowerComplexPass": 0.00016599999798927456, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.4000000848900527e-05, + "neuron-hlo-verifier": 0.00033099998836405575, + "operand_upcaster": 1.5999999959603883e-05, + "post-par-pipe-begin": 3.999999989900971e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00046400001156143844, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 1.9999999949504854e-06, + "simplify-concat": 3.199999991920777e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.5000000682193786e-05, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 55.24231719970703, + "HloMacCount": 9820307456.0, + "Traffic": 355535680.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.012721538543701172, + "DMALocalityOpt": 0.00609898567199707, + "DMAProfiler": 0.007831335067749023, + "DataStreaming": 0.01673150062561035, + "DoNothing": 0.0002722740173339844, + "ExpandISAMacro": 0.0056455135345458984, + "FactorizeBlkDims": 0.0197756290435791, + "InferPSumTensor": 0.023047685623168945, + "InferSharedMemLoc": 0.011858940124511719, + "InsertCoreBarrier": 0.011088848114013672, + "LateLegalizeInst": 0.02294301986694336, + "LateNeuronInstComb": 0.03573012351989746, + "LegalizeSundaAccess": 0.04056549072265625, + "LegalizeType": 0.036716461181640625, + "LowerBroadcast": 0.009067773818969727, + "LowerIntrinsics": 0.0156552791595459, + "LowerTranspose": 0.004080295562744141, + "NeuronInstComb": 0.030441999435424805, + "NeuronLICM": 0.03961777687072754, + "NeuronSimplifyPredicates": 0.012285470962524414, + "NeuronValueNumbering": 0.007288455963134766, + "SFKVectorizer": 0.06282949447631836, + "SimpleAllReduceTiling": 0.016891002655029297, + "SimplifyNeuronTensor": 0.08206772804260254, + "SpillPSum": 0.045392751693725586, + "WeightCoalescing": 0.0072481632232666016 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk2/graph.neff b/context_encoding_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..41be7b180370f733e71690de9d7626ef8e0b9564 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48 +size 1342464 diff --git a/context_encoding_model/_tp0_bk2/log-neuron-cc.txt b/context_encoding_model/_tp0_bk2/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7786518571aeef79b436fd1402f91b5a26ef977 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/log-neuron-cc.txt @@ -0,0 +1,9555 @@ +2025-11-04T21:38:32Z INFO 8563 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:32Z INFO 8563 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:32Z INFO 8593 [root]: XLA detected +2025-11-04T21:38:32Z INFO 8593 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:32Z INFO 8593 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2 +2025-11-04T21:38:32Z INFO 8593 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:32Z INFO 8593 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:32Z INFO 8593 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:32Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:32Z INFO 8593 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:32Z INFO 8593 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:32Z INFO 8593 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:33Z INFO 8593 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8312 + reshape 1912 23.00% ################################################################ + broadcast 1123 13.51% ##################################### + transpose 1072 12.90% ################################### + convert 945 11.37% ############################### + constant 636 7.65% ##################### + parameter 371 4.46% ############ + slice 347 4.17% ########### + add 284 3.42% ######### + get-tuple-element 259 3.12% ######## + multiply 255 3.07% ######## + dot 198 2.38% ###### + call 174 2.09% ##### + compare 173 2.08% ##### + select 170 2.05% ##### + concatenate 116 1.40% ### + tuple 57 0.69% # + scatter 57 0.69% # + negate 56 0.67% # + all-reduce 56 0.67% # + divide 29 0.35% + gather 6 0.07% + iota 5 0.06% + all-gather 3 0.04% + reduce 3 0.04% + custom-call 2 0.02% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5437 + reshape 1421 26.14% ################################################################ + transpose 817 15.03% #################################### + convert 720 13.24% ################################ + constant 443 8.15% ################### + parameter 371 6.82% ################ + broadcast 266 4.89% ########### + dot 197 3.62% ######## + custom-call 175 3.22% ####### + multiply 171 3.15% ####### + add 171 3.15% ####### + get-tuple-element 147 2.70% ###### + slice 115 2.12% ##### + concatenate 114 2.10% ##### + compare 59 1.09% ## + select 58 1.07% ## + scatter 57 1.05% ## + negate 56 1.03% ## + all-reduce 56 1.03% ## + gather 6 0.11% + all-gather 3 0.06% + iota 3 0.06% + reduce 3 0.06% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 355140108288 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:33Z INFO 8593 [job.HLOToTensorizer.0]: IR signature: 9fe776a7afa96ef34790ea7eee8cdb29beb32a265e092214951966e7e1adb89e for sg0000/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8593 [job.HLOToTensorizer.0]: IR signature: b7ea6e6325d997a8a394b3e7915f3d477b8d1f3f7559a53a698d0f3f370c3197 for sg0001/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8593 [job.HLOToTensorizer.0]: IR signature: a518659ab084e1cc22c95d1472f2f05e4730de5a137d358888675ae1f7503339 for sg0002/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8593 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:33Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:33Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:33Z INFO 8593 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:33Z INFO 8593 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:33Z INFO 8593 [job.Frontend.0]: Start model loading +2025-11-04T21:38:33Z INFO 8593 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:33Z INFO 8593 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:33Z USER 8593 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:33Z INFO 8593 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:33Z INFO 8675 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:33Z INFO 8676 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:33Z INFO 8677 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:33Z INFO 8676 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8676 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8675 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8675 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8676 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8675 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.004 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.031 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8677 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.019 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.124 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.007 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.005 seconds +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.018 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.051 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.039 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.029 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.084 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.030 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.090 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.027 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.058 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.015 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.050 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.088 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.038 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.093 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.037 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.021 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.012 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.028 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.025 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.016 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.053 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.050 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.022 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.030 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.017 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.030 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.027 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.045 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2097152 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 512) %'all_gather.1' = AllGatherOp-34 AllGather_add(bfloat16 (1024, 512) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 512), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 15 | , id = 34 +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.030 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.033 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8676 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.030 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.084 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.026 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.103 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.028 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.055 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.031 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.004 seconds +2025-11-04T21:38:36Z INFO 8677 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.058 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.016 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6459 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6596 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.053 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.012 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.040 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.149 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.114 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.026 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.044 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.023 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.023 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.084 seconds +2025-11-04T21:38:36Z INFO 8676 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:36Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.140 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.057 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.126 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.062 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.018 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.031 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.252 seconds +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.301 seconds +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.081 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.112 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.174 seconds +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8675 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.305 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.118 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.348 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.033 seconds +2025-11-04T21:38:37Z INFO 8676 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 36 +total number of sharded dags: 12 + +total bytes transferred from input, output, non local tensors: 359736098 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 334551824 +% bytes transferred with 2x bandwidths: 93.00 + +NC0 FLOPs: 55340232218069614178 +NC1 FLOPs: 55340232218069605472 +% FLOPs sharded: 100.00 + + +Shard dim: 512, Number of dags: 6 +Matmuls sharded with this dim: +[512(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [512(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,2,6,2,128] = [512(s),2,6,2,128] Number of occurrences: 2 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.029 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.505 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8677 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.649 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.030 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(24, 'AG74'), (19, 'AG76'), (20, 'AG75')] +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG74'), (19, 'AG76'), (20, 'AG75')] +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG74'), (19, 'AG76'), (20, 'AG75')] +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG82'), (23, 'AG79'), (17, 'AG81'), (22, 'AG80')] +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(24, 'AG74'), (19, 'AG76'), (20, 'AG75')] +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG89'), (21, 'AG88'), (19, 'AG76'), (20, 'AG75')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.712 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 31 +total number of sharded dags: 24 + +total bytes transferred from input, output, non local tensors: 67642372 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 34082816 +% bytes transferred with 2x bandwidths: 50.39 + +NC0 FLOPs: 55340232221139992579 +NC1 FLOPs: 55340232221139992576 +% FLOPs sharded: 100.00 + + +Shard dim: 512, Number of dags: 23 +Matmuls sharded with this dim: +[512(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [512(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [512(s),2,2,2,2,64] Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,2,6,2,128] = [512(s),2,6,2,128] Number of occurrences: 2 +[512(s),2,8,128] @ [2,8,128,4,128] = [512(s),4,128] (stationary-streaming swapped) Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,4,2,64] = [512(s),4,2,64] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[512,4,2,128] @ [4,2,128,2(s),2,4,128] = [512,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.071 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.042 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.033 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.026 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.708 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.027 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG107'), (27, 'AG109'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG107'), (27, 'AG109'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG107'), (27, 'AG109'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG115'), (30, 'AG112'), (25, 'AG114'), (28, 'AG113')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG107'), (27, 'AG109'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG109'), (31, 'AG107'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(13, 'AG119'), (9, 'AG120')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG109'), (31, 'AG107'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(18, 'AG124'), (14, 'AG125')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG109'), (31, 'AG107'), (29, 'AG108')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(20, 'AG131'), (12, 'AG133'), (17, 'AG132')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 694 of IO tensor non_local bfloat16 %reshape.68(4, 2, 2, 64, 2, 256) is not sorted, index list (w/ AG ids): [(10, 'AG126'), (15, 'AG127'), (7, 'AG111'), (26, 'AG110')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 644 of IO tensor non_local bfloat16 %reshape.73(4, 2, 2, 256, 128) is not sorted, index list (w/ AG ids): [(11, 'AG129'), (16, 'AG130'), (7, 'AG111'), (19, 'AG128')] +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.080 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.119 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.493 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.023 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.062 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 31 +total number of sharded dags: 24 + +total bytes transferred from input, output, non local tensors: 26487814 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 18098176 +% bytes transferred with 2x bandwidths: 68.33 + +NC0 FLOPs: 3227762691 +NC1 FLOPs: 3227762688 +% FLOPs sharded: 99.99 + + +Shard dim: 512, Number of dags: 22 +Matmuls sharded with this dim: +[512(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [512(s),2,2,2,2,64] Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,4,128] = [512(s),4,128] (stationary-streaming swapped) Number of occurrences: 1 +[512(s),2,8,128] @ [2,8,128,4,2,64] = [512(s),4,2,64] Number of occurrences: 1 +[64] @ [512(s)] = [64,512(s)] Number of occurrences: 1 + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[512,4,2,128] @ [4,2,128,2(s),2,4,128] = [512,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.025 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.773 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 384: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 48: simd128x256 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingBottleneck]: 1: simd1x1 +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.024 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.174 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.403 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.045 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.543 seconds +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.086 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 384: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 48: simd128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 8: rmsnorm128x512x128 +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingBottleneck]: 8: simd64x512 +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8677 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.069 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.134 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x256 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 12: dma128x4096 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 8: dma128x1024 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(28, 'AG84'), (20, 'AG86'), (23, 'AG85')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG89'), (21, 'AG92')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG89'), (17, 'AG96')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (28, 'AG84'), (23, 'AG85')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG102'), (22, 'AG104'), (25, 'AG103')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 631 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 2, 256) is not sorted, index list (w/ AG ids): [(20, 'AG86'), (23, 'AG85'), (1, 'AG88'), (26, 'AG87')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 520 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate0(2, 256, 2, 8, 128) is not sorted, index list (w/ AG ids): [(1, 'AG88'), (26, 'AG87'), (20, 'AG86'), (23, 'AG85')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 582 of IO tensor non_local bfloat16 %reshape.16(2, 2, 2, 2, 64, 2, 256) is not sorted, index list (w/ AG ids): [(7, 'AG95'), (12, 'AG94'), (16, 'AG93'), (21, 'AG92'), (24, 'AG89'), (1, 'AG88')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 676 of IO tensor non_local bfloat16 %reshape.24(4, 2, 2, 64, 2, 256) is not sorted, index list (w/ AG ids): [(8, 'AG97'), (13, 'AG98'), (17, 'AG96'), (24, 'AG89'), (1, 'AG88')] +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 614 of IO tensor non_local bfloat16 %reshape.29(4, 2, 2, 256, 128) is not sorted, index list (w/ AG ids): [(9, 'AG100'), (14, 'AG101'), (1, 'AG88'), (18, 'AG99')] +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.075 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.050 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.051 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.028 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.119 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.390 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.039 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.032 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.317 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x256 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.047 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.022 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LICM]: LICM finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.929 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x512 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: indirect_load128x512 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x256 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x256 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x512 +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingBottleneck]: 8: simd128x128 +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.129 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.041 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.040 seconds +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8676 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.080 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.031 seconds +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:39Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.064 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.071 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.070 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.025 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.065 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.075 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.201 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x256 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x256 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x256 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x256x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x512 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: transpose_128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: generic_store128x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: indirect_load128x512 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8: simd128x256 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.099 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.025 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.028 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.043 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.019 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.028 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.066 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.033 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.083 seconds +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.079 seconds +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:40Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.033 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.051 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.052 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.079 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.019 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.022 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.022 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.020 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.052 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.033 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.044 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.067 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.060 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.032 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.017 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.038 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.027 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.020 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.021 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.054 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.063 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.049 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.104 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.054 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:41Z INFO 8676 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.067 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.019 seconds +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.091 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.120 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.028 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.016 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.121 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.107 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.038 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.016 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.040 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.037 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.047 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.040 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.041 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.065 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.066 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.031 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.024 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.045 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.208 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.016 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.052 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.028 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.040 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.033 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 69.769% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'996.1588'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1587, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_996 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 8.875% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.55'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'1013.1598'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1596, src_id=None, , instances=600 # dl = tensor_op_name: convert.55_pftranspose_1013 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 116.355us (24.000MiB, est bw: 216.285GB/s, 5.330% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 12, 512) %'input365_local_1080'[i16_0_1086,i15_0_0_0_1,i15_0_0_0_0,c1_1072,i0.128,i3.12,i1.128+128i2.2+256p_1698] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input365'[i15_0_0_0_1+2i15_0_0_0_0,p_1698,c1_1072,i0.128,i3.12,i2.2,i1.128] # id=1362, src_id=None, , instances=32 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 110.541us (24.000MiB, est bw: 227.660GB/s, 5.064% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 6, 128, 4096) %'input366_local_1057'[i11_0,i10_0_0_0,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input366'[i10_0_0_0,i10_0_0_1,i0.128,i1.4096] # id=1353, src_id=None, , instances=24 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 110.541us (24.000MiB, est bw: 227.660GB/s, 5.064% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 6, 128, 4096) %'input368_local_1068'[i16_0_1086,i12_0_0_0,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input368'[i12_0_0_0,i12_0_0_1,i0.128,i1.4096] # id=1356, src_id=None, , instances=24 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.477% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1000.1672'[i11_0,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (2, 256, 2048) %'add.9'[i11_0,i0.128+128i2.2,i1.2048] # id=1562, src_id=None, , instances=2 # dl = tensor_op_name: add.9_pftranspose_1000 | hlo_id: 27 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.477% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'_reload_1511'[i16_0_1086,i0.128,i1.4096] = load bfloat16<128 x 4096> DRAM3DBlk partitions[1] bfloat16 (2, 128, 4096) %'_spill_1508'[i16_0_1086,i0.128,i1.4096] # id=1513, src_id=None, , instances=2 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 0.477% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1004.1677'[T_i20_0_1012,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (1048576,) %'all_reduce.3-buffer-2056'[524288T_i20_0_1012+2048i0.128+i1.2048+262144i2.2] # id=1571, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.3_pftranspose_1004 | hlo_id: 66 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 0.309% of tot. time) for bfloat16<128 x 4096> DRAM3DBlk partitions[1] bfloat16 (2, 128, 4096) %'_spill_1508'[i11_0,i0.128,i1.4096] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %1023[i11_0,i0.128,i1.4096] # id=1510, src_id=None, , instances=2 # dl = tensor_op_name: _custom-call.348 | hlo_id: 34 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 6.737us (2.000MiB, est bw: 311.309GB/s, 0.309% of tot. time) for bfloat16<128 x 4096> non_local bfloat16 (1048576,) %'dot.14-buffer-2054'[524288i16_0_1086+2048i0.128+i1.2048+262144i2.2] = store bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %1087[i16_0_1086,i0.128,i2.2,i1.2048] # id=1365, src_id=None, , instances=2 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.034 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.030 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.016 seconds +2025-11-04T21:38:42Z INFO 8677 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.064 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.038 seconds +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:42Z INFO 8675 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.021 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.028 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.032 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.077 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.057 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.038 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.022 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.021 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.022 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.113 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.026 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.049 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.311 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 116.355us (24.000MiB, est bw: 216.285GB/s, 19.818% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 12, 512) %'input68_local_1442'[i16_0_1448,i15_0_0_0_1,i15_0_0_0_0,c1_1434,i0.128,i3.12,i1.128+128i2.2+256p_1934] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 12, 2, 128) %'input68'[i15_0_0_0_1+2i15_0_0_0_0,p_1934,c1_1434,i0.128,i3.12,i2.2,i1.128] # id=1677, src_id=None, , instances=32 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.128, i2.2, i3.12]] -> [[i0.128];[i1.128, i2.2, i3.12]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 110.541us (24.000MiB, est bw: 227.660GB/s, 18.828% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 6, 128, 4096) %'input69_local_1419'[i11_0,i10_0_0_0,i10_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input69'[i10_0_0_0,i10_0_0_1,i0.128,i1.4096] # id=1668, src_id=None, , instances=24 # dl = tensor_op_name: _dot.4 | hlo_id: 40 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 110.541us (24.000MiB, est bw: 227.660GB/s, 18.828% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 6, 128, 4096) %'input71_local_1430'[i16_0_1448,i12_0_0_0,i12_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 4096) %'input71'[i12_0_0_0,i12_0_0_1,i0.128,i1.4096] # id=1671, src_id=None, , instances=24 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 37.714us (8.000MiB, est bw: 222.428GB/s, 6.424% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 4096) %'input78_local_1465'[i2_0_1530,i38_0_0,c1_1458,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input78'[i38_0_0,c1_1458,i0.128,i1.4096] # id=1691, src_id=None, , instances=8 # dl = tensor_op_name: _dot.9 | hlo_id: 71 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 3.323% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input73_local_1524'[i2_0_1530,c0_1518,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input73'[c0_1518,i0.128,i1.4096] # id=1737, src_id=None, , instances=4 # dl = tensor_op_name: _dot.7 | hlo_id: 155 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 3.323% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input76_local_1486'[i67_0,c0_1479,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input76'[c0_1479,i0.128,i1.4096] # id=1714, src_id=None, , instances=4 # dl = tensor_op_name: _dot.8 | hlo_id: 114 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 3.323% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input72_local_1549'[i98_0_0_0,i98_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input72'[i98_0_0_0,i98_0_0_1,i0.128,i1.4096] # id=1742, src_id=None, , instances=4 # dl = tensor_op_name: _dot.10 | hlo_id: 173 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 1.772% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1367.1898'[i11_0,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (2, 256, 2048) %'add.4'[i11_0,i0.128+128i2.2,i1.2048] # id=1791, src_id=None, , instances=2 # dl = tensor_op_name: add.4_pftranspose_1367 | hlo_id: 15 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 1.772% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 4096) %'_reload_1785'[i16_0_1448,i0.128,i1.4096] = load bfloat16<128 x 4096> DRAM3DBlk partitions[1] bfloat16 (2, 128, 4096) %'_spill_1782'[i16_0_1448,i0.128,i1.4096] # id=1787, src_id=None, , instances=2 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 1.772% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 2, 2048) %'1371.1903'[i2_0_1530,i0.128,i2.2,i1.2048] = load bfloat16<128 x 4096> non_local bfloat16 (1048576,) %'all_reduce.1-buffer-2429'[524288i2_0_1530+2048i0.128+i1.2048+262144i2.2] # id=1800, src_id=None, , instances=2 # dl = tensor_op_name: all_reduce.1_pftranspose_1371 | hlo_id: 54 | [[i0.128];[i1.2048, i2.2]] -> [[i0.128];[i1.2048, i2.2]] +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8676 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.216 seconds +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.119 seconds +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8676 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.499 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 23372) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.435 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.052 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.053 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.029 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.083 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 37.714us (8.000MiB, est bw: 222.428GB/s, 14.201% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 4096) %'input67_local_1562'[i2_0_1599,i35_0_0,c1_1555,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input67'[i35_0_0,c1_1555,i0.128,i1.4096] # id=1760, src_id=None, , instances=8 # dl = tensor_op_name: _dot.2 | hlo_id: 32 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.222us (2.000MiB, est bw: 79.978GB/s, 9.873% of tot. time) for bfloat16<128 x 256> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 128) %'intermediate0_pftranspose_1484'[i0_0,i1_0_0,i1_0_1_0,i0.128,p_2161,i2.2,i1.128] = load bfloat16<128 x 256> non_local bfloat16 (2, 4, 2, 128, 2, 256) %'all_gather.1'[i1_0_0,i1_0_1_0,p_2161,i0.128,i0_0,i1.128+128i2.2] # id=1719, src_id=None, , instances=32 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.128, i2.2]] -> [[i0.128];[i1.128, i2.2]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.222us (2.000MiB, est bw: 79.978GB/s, 9.873% of tot. time) for bfloat16<128 x 256> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 512) %'custom-call.177.2078'[i2_0_1599,i16_0_0_1544,i16_0_1_0_1544,i0.128,i1.256+256i16_0_1_1_1544] = load bfloat16<128 x 256> non_local bfloat16 (2, 4, 2, 128, 2, 256) %'all_gather.1'[i16_0_0_1544,i16_0_1_0_1544,i16_0_1_1_1544,i0.128,i2_0_1599,i1.256] # id=1755, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.177 | hlo_id: 24 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 7.345% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input62_local_1593'[i2_0_1599,c0_1587,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input62'[c0_1587,i0.128,i1.4096] # id=1854, src_id=None, , instances=4 # dl = tensor_op_name: _dot | hlo_id: 129 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 7.345% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input65_local_1578'[i64_0,c0_1571,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input65'[c0_1571,i0.128,i1.4096] # id=1807, src_id=None, , instances=4 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 19.507us (4.000MiB, est bw: 215.017GB/s, 7.345% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[2] bfloat16 (2, 2, 128, 4096) %'input61_local_1618'[i95_0_0_0,i95_0_0_1,i0.128,i1.4096] = load bfloat16<128 x 4096> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input61'[i95_0_0_0,i95_0_0_1,i0.128,i1.4096] # id=1859, src_id=None, , instances=4 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.4096]] -> [[i0.128];[i1.4096]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 10.403us (2.000MiB, est bw: 201.582GB/s, 3.917% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[1] bfloat16 (2, 128, 8, 4, 128) %'get_tuple_element.1_local_1607'[i95_0_0_0,i0.128,i3.8,i2.4,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (8, 128, 4, 128) %'get_tuple_element.1'[i3.8,i0.128,i2.4,i1.128] # id=1858, src_id=None, , instances=2 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.128, i2.4, i3.8]] -> [[i0.128];[i1.128, i2.4, i3.8]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 9.518us (1.000MiB, est bw: 110.165GB/s, 3.584% of tot. time) for bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (2, 4, 128, 512) %'transpose.1_pftranspose_1479'[T_i2_0_1483,i3_0,i0.128,i1.512] = indirect_load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (151936, 2, 512) %'input60'[i0.128,T_i2_0_1483,i1.512] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[1] int32 (2, 128, 4) %'input0_pftranspose_1475'[T_i2_0_1483,i0.128,i3_0] # id=1717, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=8 # dl = tensor_op_name: _gather.41 | hlo_id: 12 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 8.725us (1.000MiB, est bw: 120.176GB/s, 3.285% of tot. time) for bfloat16<128 x 256> non_local bfloat16 (2, 2, 2, 2, 2, 128, 128) %'reshape.29'[i0_0_1599_2699,i0_1_1599_2699,i1_1599_2699,i2_0_1599,i2.2,i0.128,i1.128] = store bfloat16<128 x 256> TongaSB partitions[2] bfloat16 (2, 2, 128, 2, 2, 2, 128) %1600[i2_0_1599,i0_0_1599_2699,i0.128,i2.2,i0_1_1599_2699,i1_1599_2699,i1.128] # id=1857, src_id=None, , instances=16 # dl = tensor_op_name: _reshape.90 | hlo_id: 134 | [[i0.128];[i1.128, i2.2]] -> [[i0.128];[i1.128, i2.2]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 8.725us (1.000MiB, est bw: 120.176GB/s, 3.285% of tot. time) for bfloat16<128 x 256> non_local bfloat16 (2, 2, 2, 128, 2, 256) %'reshape.16'[T_i41_0_0_1493,T_i41_0_1_1493_2703_2704,T_i41_1_1493_2701_2702_2704,i0.128,T_i40_0_1493,i1.256] = store bfloat16<128 x 256> TongaSB partitions[2] bfloat16 (2, 2, 128, 1024) %'1489.2030'[T_i40_0_1493,T_i41_0_0_1493,i0.128,i1.256+512T_i41_0_1_1493_2703_2704+256T_i41_1_1493_2701_2702_2704] # id=2028, src_id=None, , instances=16 # dl = tensor_op_name: reshape.16_pftranspose_1489 | hlo_id: 79 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.016 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.017 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.017 seconds +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:44Z INFO 8677 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8675 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 23372) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.039 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.459 seconds +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8676 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.043 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.044 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.058 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.043 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.020 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: BirCodeGen estimate #instances=1821 in sg0001 +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: IR signature: 32e51aacb8241f10c360f7a312d910d2d33b74fc786037146c5371daaf66ad2c for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.013 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.015 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.043 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.049 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8676 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.044 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.049 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: BirCodeGen estimate #instances=1821 in sg0001 +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: IR signature: 70a13c722494c0d52d76a6bfe1e89eaae6a76672c986572963f54fccf8bff146 for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: Weights total number of bytes: 163842 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8676 [Tensorizer]: Successfully built model. +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.030 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.004 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.033 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.036 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.118 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.045 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.016 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.037 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.040 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: BirCodeGen estimate #instances=816 in sg0000 +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: IR signature: f16af7d1452563922d00658de44867fc5dc87023bd08091c29c1d80592b83a87 for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.023 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8675 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.064 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.041 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: BirCodeGen estimate #instances=816 in sg0000 +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: IR signature: 7bda5f3abd6c3e5a01de46a3c7fb933d633d71ec8602b64bd907bacc69d9a1c6 for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: Weights total number of bytes: 213250 +2025-11-04T21:38:45Z INFO 8675 [Tensorizer]: Successfully built model. +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.082 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.063 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.023 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.017 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.011 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.498 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.052 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.053 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:47Z WARNING 8677 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 67.06 percent of all matmul computation +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.024 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.098 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.020 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.055 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.055 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.037 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.031 seconds +2025-11-04T21:38:47Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:48Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.394 seconds +2025-11-04T21:38:48Z INFO 8677 [Tensorizer]: BirCodeGen estimate #instances=25563 in sg0002 +2025-11-04T21:38:48Z INFO 8677 [Tensorizer]: IR signature: 714cc16451808f5633632bb061345a02be740d1644216c11382d7bf436a6afef for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:48Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:48Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8677 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.437 seconds +2025-11-04T21:38:49Z INFO 8677 [Tensorizer]: BirCodeGen estimate #instances=25563 in sg0002 +2025-11-04T21:38:49Z INFO 8677 [Tensorizer]: IR signature: be97ef8924253e0a8a20f0d72180d6a71773488d3f9bea84cd2b2b509e9315bb for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:49Z INFO 8677 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:49Z INFO 8677 [Tensorizer]: Successfully built model. +2025-11-04T21:38:49Z USER 8593 [root/Tensorizer/Tensorizer]: Tensorizer finished after 15.920 seconds +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: End tensorization +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:49Z INFO 8593 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:49Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:49Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:49Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:49Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:49Z INFO 8593 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:49Z INFO 8593 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,vector_dynamic_offsets,spill_reload,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:49Z INFO 8593 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:49Z INFO 9058 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye" +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:49Z INFO 9058 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:49Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1900 blocks=6 instructions=1876 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.232.2296}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 87mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.232.2296}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 94mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.017 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 98mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.038 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 105mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.041 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 107mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.170 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 171mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.185 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.187 seconds +2025-11-04T21:38:49Z INFO 9058 [BackendPassManager]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:49Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1900 blocks=6 instructions=1876 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z USER 9058 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=308 blocks=2 instructions=198 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 308 memory location(s), 2 block(s), and 198 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1178 blocks=2 instructions=1414 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1178 memory location(s), 2 block(s), and 1414 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=414 blocks=2 instructions=264 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 414 memory location(s), 2 block(s), and 264 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:49Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 9058 [BackendPassManager]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1900 blocks=6 instructions=1876 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 176mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 175mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 154 memory location(s), 1 block(s), and 99 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=154 blocks=1 instructions=99 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 176mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 589 memory location(s), 1 block(s), and 707 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=589 blocks=1 instructions=707 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 177mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 207 memory location(s), 1 block(s), and 132 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=207 blocks=1 instructions=132 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Total count: 816 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Matmult: 417 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: TensorScalarPtr: 99 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: TensorTensor: 72 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: GenericCopy: 64 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Load: 56 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Save: 33 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Activation: 29 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: DMACopy: 21 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 20 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.026 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 202mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 858 memory location(s), 1 block(s), and 816 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=858 blocks=1 instructions=816 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9058 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.009 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 207mb, ru_maxrss: 212mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Total count: 814 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Matmult: 417 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: TensorScalarPtr: 99 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: TensorTensor: 72 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: GenericCopy: 64 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Load: 56 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Save: 32 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Activation: 29 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: DMACopy: 20 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 20 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.052 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 214mb, ru_maxrss: 214mb (delta=2mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 858 memory location(s), 1 block(s), and 814 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=858 blocks=1 instructions=814 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9058 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 214mb, ru_maxrss: 214mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Total count: 1819 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Matmult: 1402 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: TensorScalarPtr: 81 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: GenericCopy: 78 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Load: 70 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: TensorTensor: 60 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Activation: 54 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Save: 31 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: DMACopy: 17 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 16 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.075 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 231mb, ru_maxrss: 231mb (delta=19mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 891 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=891 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9058 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 230mb, ru_maxrss: 231mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Total count: 1821 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Matmult: 1402 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: TensorScalarPtr: 81 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: GenericCopy: 78 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Load: 70 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: TensorTensor: 60 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Activation: 54 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Save: 32 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: DMACopy: 18 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 16 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.091 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 233mb, ru_maxrss: 233mb (delta=21mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 891 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=891 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9058 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.005 seconds +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 232mb, ru_maxrss: 233mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Total count: 13733 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Matmult: 11018 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: GenericCopy: 1432 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Load: 377 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Save: 310 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: TensorTensor: 47 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Activation: 45 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: CollectiveCompute: 7 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.314 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 348mb, ru_maxrss: 348mb (delta=136mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5361 memory location(s), 1 block(s), and 13733 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5361 blocks=1 instructions=13733 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Total count: 13744 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Matmult: 11018 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: GenericCopy: 1432 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Load: 377 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Save: 321 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: TensorTensor: 47 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Activation: 45 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Memset: 22 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: TensorReduce: 8 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: CollectiveCompute: 7 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.358 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 316mb, ru_maxrss: 348mb (delta=136mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5361 memory location(s), 1 block(s), and 13744 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5361 blocks=1 instructions=13744 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.062 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.044 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.414 seconds +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=136mb) +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6951 blocks=6 instructions=31953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:50Z USER 9058 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:50Z USER 9058 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=757 blocks=2 instructions=1627 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=851 blocks=2 instructions=3639 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 757 memory location(s), 2 block(s), and 1627 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 851 memory location(s), 2 block(s), and 3639 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5343 blocks=2 instructions=26687 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5343 memory location(s), 2 block(s), and 26687 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6951 blocks=6 instructions=31953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1254_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1259_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 272mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.073 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.076 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.077 seconds +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=6951 blocks=6 instructions=31953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z USER 9058 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=757 blocks=2 instructions=1627 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=851 blocks=2 instructions=3639 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=5343 blocks=2 instructions=26687 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 757 memory location(s), 2 block(s), and 1627 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 851 memory location(s), 2 block(s), and 3639 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5343 memory location(s), 2 block(s), and 26687 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=6951 blocks=6 instructions=31953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 278mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z WARNING 9058 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 496 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z WARNING 9058 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.009 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z WARNING 9058 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 496 bytes/partition +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 279mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Allocs: 378 instructions: 812 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z WARNING 9058 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 1944 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Done build fdeps 1944 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 378 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=378 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: size = 89 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: found 129 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: mean: 2.89888 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: median: 1.9203 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 1032 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Allocs: 379 instructions: 815 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: lo = 89 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: total = 89 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 1946 edges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Done build fdeps 1946 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.029 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 20 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=379 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: size = 89 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: found 129 edges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: mean: 2.89888 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: median: 1.9203 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 1032 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: lo = 89 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: total = 89 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.004 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z WARNING 9058 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 4325376 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1056 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: size = 255 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: found 44 accumulation groups +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: largest = _dot.1-t1671_i6 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: requires 24576 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Allocs: 425 instructions: 1818 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: 15 remat count +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Num intervals 255 Num locations 255 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: edge: 3589 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: mean: 28.149 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: median: 23.1726 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: safe = 236 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: unsafe = 17 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: total = 253 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 255 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Total: 253 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (253) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Rover zone: 0.917 (232) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.020 (5) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.063 (16) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.004 (1) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.996 (252) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.980 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 4325376 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1056 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 20 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 19 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 4325378 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1055 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: size = 256 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: found 44 accumulation groups +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: largest = _dot.1-t1671_i3 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: requires 24576 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: 15 remat count +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Num intervals 256 Num locations 256 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: edge: 3595 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: mean: 28.0859 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: median: 23.2938 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: safe = 237 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: unsafe = 17 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: total = 254 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 256 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Total: 254 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (254) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Rover zone: 0.917 (233) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.020 (5) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.063 (16) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.004 (1) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.996 (253) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.977 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 4325378 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1055 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=380 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 18007812, 58.5119% input load, 6.55076% output write, 34.9374% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 381 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=381 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 18007814, 58.5119% input load, 6.55077% output write, 34.9374% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.05367e+07) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.05367e+07) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.068 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 1055 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 4325378 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1055 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 18007814, 58.5119% input load, 6.55077% output write, 34.9374% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 4325378 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1055 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1218 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z WARNING 9058 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 280mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 8 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Allocs: 426 instructions: 1821 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.007 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.014 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.034 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 5117 edges +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Done build fdeps 5117 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 5115 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Done build fdeps 5115 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 1056 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 4325376 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1056 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.037 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 18007812, 58.5119% input load, 6.55076% output write, 34.9374% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 13682436 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2135 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 4325376 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1056 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1058816 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1218 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.021 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=426 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=427 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.049 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 281mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: size = 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: found 190 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: mean: 2.96875 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: median: 2.0604 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 1520 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: lo = 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: total = 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 27 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=425 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 426 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=426 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 25 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: size = 128 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 52748804 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 6150 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5767170 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: found 190 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: mean: 2.96875 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: median: 2.0604 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 1520 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: size = 258 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: found 106 accumulation groups +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: largest = _dot.6-t1604_i5 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: tensors = 14 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: requires 36864 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.017 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.009 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.036 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 282mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 16 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: 45 remat count +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Num intervals 258 Num locations 258 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.042 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: reserved space = 147712 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: lo = 128 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: total = 128 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: edge: 3706 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: mean: 28.7287 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: median: 22.4859 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 16 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: safe = 224 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: unsafe = 28 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: inf = 4 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: total = 256 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 258 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Total: 256 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (256) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Rover zone: 0.938 (240) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.016 (4) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.047 (12) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.004 (1) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Blocks tall: 0.996 (255) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.983 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 52748804 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 6150 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5767170 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.018 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 13 out of 72 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 815 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=380 blocks=1 instructions=815 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 380 memory location(s), 1 block(s), and 816 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=380 blocks=1 instructions=816 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 816, number of allocs: 380 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2747-0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 8.2e-05 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2747-0] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: input0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: input1: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: input2: [ 4 512 128 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: output0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 0 +Memory Location: {reshape.16}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 0 +Memory Location: {reshape.24}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 512 / 512 = 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Scratch sbuf for kernel I-2747-0: [38912, 62284) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 0.006866 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 647 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.037 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: reserved space = 147712 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 27 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 13 out of 71 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 812 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=379 blocks=1 instructions=812 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 428 memory location(s), 1 block(s), and 1821 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=428 blocks=1 instructions=1821 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 58515974, 82.9765% input load, 1.79195% output write, 15.2316% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 379 memory location(s), 1 block(s), and 813 instruction(s). Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=379 blocks=1 instructions=813 Max writers: 16 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 813, number of allocs: 379 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2747-0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 5.2e-05 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 25 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 131072, 0.223994% out of total dma traffic(4.85545e+07) +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=647 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 647 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=647 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 647 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=647 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 663 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=663 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 663 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=663 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.011 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 283mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 52748804 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 6150 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 5767168 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2747-0] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: input0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: input1: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: input2: [ 4 512 128 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: output0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 262144 +Memory Location: {reshape.16}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 262144 +Memory Location: {reshape.24}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 512 / 512 = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Scratch sbuf for kernel I-2747-0: [38912, 62284) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 0.024091 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 646 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=646 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 646 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=646 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 646 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=646 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: size = 257 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: found 106 accumulation groups +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: largest = _dot.6-t1604_i15 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: tensors = 14 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: requires 36864 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: 45 remat count +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Num intervals 257 Num locations 257 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 663 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=663 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 663 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=663 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Allocs: 663 instructions: 1175 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 662 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=662 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 662 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: edge: 3700 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=662 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: mean: 28.7938 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: median: 22.5831 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2747-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,42244>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: safe = 223 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: unsafe = 28 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: inf = 4 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: total = 255 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 257 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Total: 255 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (255) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Rover zone: 0.937 (239) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.016 (4) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.047 (12) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.004 (1) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Blocks tall: 0.996 (254) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.986 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 52617732 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 5767170 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 52748804 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 6150 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 5767168 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.013 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 662 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=662 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 662 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=662 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Allocs: 662 instructions: 1172 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 427 memory location(s), 1 block(s), and 1818 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=427 blocks=1 instructions=1818 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 2497 edges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [build_flow_deps]: Done build fdeps 2497 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 58515972, 82.9765% input load, 1.79195% output write, 15.2316% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 663 memory location(s), 1 block(s), and 1175 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 131072, 0.223994% out of total dma traffic +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 58384902, 82.9383% input load, 1.79597% output write, 15.2658% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=663 blocks=1 instructions=1175 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 52617732 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 5767170 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3595 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.026 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=425 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 284mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 651 memory location(s), 1 block(s), and 1147 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=651 blocks=1 instructions=1147 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 131072, 0.223994% out of total dma traffic(4.85545e+07) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 2495 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [build_flow_deps]: Done build fdeps 2495 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.013 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 662 memory location(s), 1 block(s), and 1172 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=662 blocks=1 instructions=1172 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 650 memory location(s), 1 block(s), and 1144 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=650 blocks=1 instructions=1144 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 285mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 651 memory location(s), 1 block(s), and 1147 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=651 blocks=1 instructions=1147 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 16 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 52617732 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1131 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=634 blocks=1 instructions=1131 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.016 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 5767168 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 650 memory location(s), 1 block(s), and 1144 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=650 blocks=1 instructions=1144 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1131 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 16 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 131072, 0.223994% out of total dma traffic +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 58384900, 82.9383% input load, 1.79597% output write, 15.2658% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 52617732 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 6323 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 5767168 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1453 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 532480 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3595 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.027 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1816 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=424 blocks=1 instructions=1816 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1128 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=633 blocks=1 instructions=1128 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.103 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1128 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.049 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=425 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: reserved space = 98304 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: spill space = 1048576 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 1048576 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: size = 1 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.015 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Num intervals 1 Num locations 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: lo = 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: total = 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=425 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=425 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 9 out of 88 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1819 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=425 blocks=1 instructions=1819 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 425 memory location(s), 1 block(s), and 1820 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=425 blocks=1 instructions=1820 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 1820, number of allocs: 425 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2446-0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 0.000192 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2446-0] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: input0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: input1: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: input2: [ 4 512 128 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: output0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 0 +Memory Location: {reshape.60}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 0 +Memory Location: {reshape.68}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 512 / 512 = 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Scratch sbuf for kernel I-2446-0: [71424, 94796) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.107 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 286mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 0.005246 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=692 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=692 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 692 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=692 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 708 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=708 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.045 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1816 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=424 blocks=1 instructions=1816 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 287mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 708 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=708 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: reserved space = 98304 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: spill space = 1048576 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 1048576 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: size = 1 +2025-11-04T21:38:50Z INFO 9058 []: find first defs for local +2025-11-04T21:38:50Z INFO 9058 []: find first defs for global +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: Num intervals 1 Num locations 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: lo = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: total = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 288mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1816 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=424 blocks=1 instructions=1816 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 288mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1816 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=424 blocks=1 instructions=1816 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 9 out of 87 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 288mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1816 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=424 blocks=1 instructions=1816 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 288mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 424 memory location(s), 1 block(s), and 1817 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=424 blocks=1 instructions=1817 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 1817, number of allocs: 424 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2446-0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 6.8e-05 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2446-0] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: input0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: input1: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: input2: [ 4 512 128 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: output0: [ 4 128 512 ] +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 262144 +Memory Location: {reshape.60}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[65536,4],[512,128],[1,512]] +Offset: 262144 +Memory Location: {reshape.68}@DRAM(524288x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 512 / 512 = 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Scratch sbuf for kernel I-2446-0: [71424, 94796) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 289mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: seq_len=512, seq_len2=512, complete_seq_len2=512 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 708 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=708 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 0.006069 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 708 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=708 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 691 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=691 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 691 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=691 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Allocs: 708 instructions: 2179 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 691 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=691 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 707 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=707 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 707 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=707 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2446-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,74756>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 707 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=707 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 707 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=707 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Allocs: 707 instructions: 2176 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 5668 edges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [build_flow_deps]: Done build fdeps 5668 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 708 memory location(s), 1 block(s), and 2179 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=708 blocks=1 instructions=2179 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.043 seconds +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 290mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 696 memory location(s), 1 block(s), and 2151 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=696 blocks=1 instructions=2151 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 5666 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [build_flow_deps]: Done build fdeps 5666 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.008 seconds +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 707 memory location(s), 1 block(s), and 2176 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=707 blocks=1 instructions=2176 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 291mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 695 memory location(s), 1 block(s), and 2148 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=695 blocks=1 instructions=2148 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 292mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 12953 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2447 blocks=1 instructions=12953 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.035 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.021 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 696 memory location(s), 1 block(s), and 2151 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=696 blocks=1 instructions=2151 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 16 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2896 memory location(s), 1 block(s), and 13734 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2896 blocks=1 instructions=13734 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 293mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2135 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=679 blocks=1 instructions=2135 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 294mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2135 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.023 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 294mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 695 memory location(s), 1 block(s), and 2148 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=695 blocks=1 instructions=2148 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 16 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 294mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2132 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=678 blocks=1 instructions=2132 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9058 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z USER 9058 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 294mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2132 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [build_flow_deps]: Allocs: 2898 instructions: 13732 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [build_flow_deps]: Allocs: 2449 instructions: 12955 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 45794 edges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [build_flow_deps]: Done build fdeps 45794 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 33880 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [build_flow_deps]: Done build fdeps 33880 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.169 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 304mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2898 memory location(s), 1 block(s), and 13732 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2898 blocks=1 instructions=13732 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.196 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2449 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.042 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2449 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2450 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2450 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: size = 1042 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: 50% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 4 tensors +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: found 1019 edges +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: mean: 1.95585 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: median: 1.40689 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 8152 bytes +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.091 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2835 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2836 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2836 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: lo = 968 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: total = 1042 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: size = 1166 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [PSUM_Allocator]: 50% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.106 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: 50% PSUM demand before spilling +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 4 tensors +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: found 1081 edges +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: mean: 1.8542 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: median: 1.32069 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 8648 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: lo = 1092 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: total = 1166 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.040 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [PSUM_Allocator]: 50% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.114 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.026 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 21 PSUM Banks +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 487 PSUM Banks +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 23 PSUM Banks +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.069 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 197160862 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4420 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 3314443 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2743 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: size = 1623 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 24 PSUM Banks +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: found 1161 accumulation groups +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1192_i0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: tensors = 14 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: requires 36864 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:51Z INFO 9058 []: find first defs for local +2025-11-04T21:38:51Z INFO 9058 []: find first defs for global +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: 349 remat count +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.178 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Num intervals 1623 Num locations 1623 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: edge: 9923 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: mean: 12.228 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: median: 6.34918 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 196525458 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4465 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 3299072 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 4062 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: safe = 1595 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: unsafe = 22 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: inf = 4 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: total = 1621 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1623 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Total: 1621 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (1621) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Rover zone: 0.969 (1571) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.023 (38) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.005 (8) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Slice zone: 0.002 (4) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.070 (113) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.007 (12) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.573 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.595 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.815 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.923 (1496) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.682 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.966 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: size = 1371 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: found 1037 accumulation groups +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1192_i13 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: tensors = 14 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: requires 36864 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:51Z INFO 9058 []: find first defs for local +2025-11-04T21:38:51Z INFO 9058 []: find first defs for global +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: 339 remat count +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Num intervals 1371 Num locations 1371 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: edge: 8445 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: mean: 12.3195 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: median: 6.24014 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: safe = 1345 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: unsafe = 20 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: inf = 4 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: total = 1369 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1371 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Total: 1369 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (1369) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Rover zone: 0.984 (1347) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.012 (16) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.004 (6) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.017 (23) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.001 (2) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.700 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.714 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.714 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.982 (1344) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.766 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.998 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 196525458 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4465 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 3299072 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 4062 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.168 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.015 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12955 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2451 blocks=1 instructions=12955 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 199824530, 96.773% input load, 0% output write, 3.22696% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 197160862 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4420 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 3314443 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2743 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.273 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.93376e+08) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.029 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13669 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2837 blocks=1 instructions=13669 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 200475305, 96.6166% input load, 1.99526e-06% output write, 3.38339% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.93692e+08) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 4478 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 4822 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 196525202 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4478 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 3298816 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 4822 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 2 spill/reload instructions +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 1 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.00794012% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.000256225% out of total dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 199824018, 96.7733% input load, 0% output write, 3.22671% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 196525202 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4478 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 3298816 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 4822 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4481 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.174 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12954 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2449 blocks=1 instructions=12954 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 4433 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 3068 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 197160606 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4433 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 3314187 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3068 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 512, 0.00754843% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 512, 0.000255393% out of total dma traffic +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 200474793, 96.6169% input load, 1.99526e-06% output write, 3.38315% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 197160606 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4433 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 3314187 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3068 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4399 bytes +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.157 seconds +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13668 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2835 blocks=1 instructions=13668 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 155 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 175 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.269 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12954 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2449 blocks=1 instructions=12954 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: spill space = 1048576 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 1048576 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: size = 1 +2025-11-04T21:38:51Z INFO 9058 []: find first defs for local +2025-11-04T21:38:51Z INFO 9058 []: find first defs for global +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: Num intervals 1 Num locations 1 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: lo = 1 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: total = 1 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.032 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12954 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2449 blocks=1 instructions=12954 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.019 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12954 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2449 blocks=1 instructions=12954 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 597 out of 1243 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12954 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2449 blocks=1 instructions=12954 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 34 Sb address +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 12957, number of allocs: 2449 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.002212 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1254_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z WARNING 9058 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1259_i1}@SB<96,17536>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.040 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9058 (nc01/sg02) [build_flow_deps]: Allocs: 2449 instructions: 12957 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 33882 edges +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [build_flow_deps]: Done build fdeps 33882 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.079 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 156 Sb address +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.526 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 319mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13668 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2835 blocks=1 instructions=13668 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: spill space = 1055748 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 1077248 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for local +2025-11-04T21:38:52Z INFO 9058 []: find first defs for global +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.125 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.087 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 319mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13668 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2835 blocks=1 instructions=13668 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 2097152 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 2097152 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.021 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13668 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2835 blocks=1 instructions=13668 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 597 out of 1382 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 316mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13668 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2835 blocks=1 instructions=13668 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 13671, number of allocs: 2835 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.065 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2449 blocks=1 instructions=12957 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.005489 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.033 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 315mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2449 memory location(s), 1 block(s), and 12957 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.032 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.035 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [build_flow_deps]: Allocs: 2835 instructions: 13671 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 45733 edges +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [build_flow_deps]: Done build fdeps 45733 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.059 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.016 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.131 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 334mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.036 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2835 blocks=1 instructions=13671 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.012 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2835 memory location(s), 1 block(s), and 13671 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 2.232 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=7908 blocks=6 instructions=33154 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1357 blocks=2 instructions=4267 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1267 blocks=2 instructions=2259 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1267 memory location(s), 2 block(s), and 2259 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5284 blocks=2 instructions=26628 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=1267 blocks=2 instructions=2259 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1267 memory location(s), 2 block(s), and 2263 instruction(s). Max writers: 17 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=1267 blocks=2 instructions=2263 Max writers: 17 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5284 memory location(s), 2 block(s), and 26628 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=5284 blocks=2 instructions=26628 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1267 memory location(s), 2 block(s), and 2267 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1357 memory location(s), 2 block(s), and 4267 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=1357 blocks=2 instructions=4267 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1357 memory location(s), 2 block(s), and 4271 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=1357 blocks=2 instructions=4271 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.005 seconds +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1357 memory location(s), 2 block(s), and 4275 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5288 memory location(s), 2 block(s), and 26642 instruction(s). Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=5288 blocks=2 instructions=26642 Max writers: 298 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.051 seconds +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5288 memory location(s), 2 block(s), and 26646 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.079 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33188 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: reserved space = 114944 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: spill space = 11534336 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 11534336 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: reserved space = 114944 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: spill space = 11534336 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 11534336 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for local +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for global +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: reserved space = 1114112 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: spill space = 14680064 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 14680064 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 7340032 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 7340032 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: reserved space = 1114112 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: spill space = 14680064 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 14680064 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 11534336 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: reserved space = 1090572 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: spill space = 8707074 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 8753152 bytes +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: size = 9 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for local +2025-11-04T21:38:52Z INFO 9058 []: find first defs for global +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: size = 19 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for local +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: lo = 9 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: total = 9 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 9437184 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 9437184 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.009 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 []: find first defs for global +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: reserved space = 1081344 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: spill space = 8707074 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 8753152 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.038 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12966 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Num intervals 19 Num locations 19 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: lo = 19 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: total = 19 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 1048576 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 5259264 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 5259264 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 7671808 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.072 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.074 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33188 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=1357 blocks=2 instructions=4275 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=5288 blocks=2 instructions=26646 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1357 memory location(s), 2 block(s), and 4275 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5288 memory location(s), 2 block(s), and 26646 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=1267 blocks=2 instructions=2267 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1267 memory location(s), 2 block(s), and 2267 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33188 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.002 seconds +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.026 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12966 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.032 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.037 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33188 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9058 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9058 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=4150 blocks=3 instructions=16954 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=3762 blocks=3 instructions=16234 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.070 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 327mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3762 memory location(s), 3 block(s), and 16234 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.074 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 325mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 4150 memory location(s), 3 block(s), and 16954 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: nc_parallel_pass finished after 0.076 seconds +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: curr_vmrss: 323mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33188 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12966 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 320mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware simulation time: 370956 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.036 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware simulation time: 378537 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Time-aware simulation time: 16236585 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.065 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 326mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.027 seconds +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:52Z USER 9058 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Time-aware simulation time: 16441272 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.105 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 332mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 332mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.006 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 332mb, ru_maxrss: 348mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Time-aware simulation time: 1324250 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.778 seconds +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 372mb, ru_maxrss: 372mb (delta=24mb) +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12966 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12966 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2451 blocks=1 instructions=12966 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.027 seconds +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Time-aware simulation time: 1484874 +2025-11-04T21:38:53Z INFO 9058 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.927 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 372mb (delta=24mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:53Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.951 seconds +2025-11-04T21:38:53Z INFO 9058 [BackendPassManager]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=24mb) +2025-11-04T21:38:53Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:53Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33184 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z USER 9058 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z INFO 9058 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1357 blocks=2 instructions=4275 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z INFO 9058 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5288 blocks=2 instructions=26642 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9058 (sg02) [SubgraphForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z USER 9058 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9058 (sg01) [SubgraphForkPass]: curr_vmrss: 359mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1357 memory location(s), 2 block(s), and 4275 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z INFO 9058 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5288 memory location(s), 2 block(s), and 26642 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1267 blocks=2 instructions=2267 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z USER 9058 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 359mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1267 memory location(s), 2 block(s), and 2267 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:53Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9058 [BackendPassManager]: curr_vmrss: 359mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:53Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33184 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:53Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 55 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 60 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 55 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 95 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 36 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 66 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 30 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 60 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 30 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 73 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 95 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 95 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 36 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9058 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.105 seconds +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 359mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 95 Sb address +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.111 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 354mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9058 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.101 seconds +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 66 PSUM Banks +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 72 Sb address +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.030 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.040 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.152 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.042 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.013 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [build_flow_deps]: Allocs: 634 instructions: 1135 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.012 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [build_flow_deps]: Allocs: 679 instructions: 2139 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 2509 edges +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [build_flow_deps]: Done build fdeps 2509 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.007 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 4 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 43264 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10493444 │ +│ Load │ Internal │ 69 │ 4718592 │ +│ Save │ Internal │ 45 │ 3670016 │ +│ Save │ Internal -> Output │ 4 │ 1179650 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 1 │ +│ 32 │ 3 │ +│ 256 │ 50 │ +│ 512 │ 59 │ +│ 1024 │ 5 │ +│ 2048 │ 8 │ +│ 4096 │ 1 │ +│ 8192 │ 12 │ +│ 1048576 │ 16 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 513 #MatMult-Transposes 105 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: IO Tensor size combined: 457974276 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 2097152 │ +│ intermediate3 │ Output │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ intermediate3-buffer-2735 │ Internal │ bfloat16 │ 2097152 │ +│ all_gather.1 │ Internal │ bfloat16 │ 2097152 │ +│ dot.4-buffer-2733 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.29 │ Internal │ bfloat16 │ 1048576 │ +│ reshape.24 │ Internal │ bfloat16 │ 1048576 │ +│ transpose.1 │ Internal │ bfloat16 │ 1048576 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 1048576 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1135 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 5641 edges +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [build_flow_deps]: Done build fdeps 5641 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.030 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [build_flow_deps]: Allocs: 633 instructions: 1132 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 2506 edges +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [build_flow_deps]: Done build fdeps 2506 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.032 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 4 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 43264 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10493444 │ +│ Load │ Internal │ 69 │ 4718592 │ +│ Save │ Internal │ 45 │ 3670016 │ +│ Save │ Internal -> Output │ 3 │ 1179648 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 16 │ 1 │ +│ 32 │ 3 │ +│ 256 │ 50 │ +│ 512 │ 59 │ +│ 1024 │ 5 │ +│ 2048 │ 8 │ +│ 4096 │ 1 │ +│ 8192 │ 12 │ +│ 1048576 │ 16 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 513 #MatMult-Transposes 105 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: IO Tensor size combined: 457974276 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input63 │ ExternalInput │ bfloat16 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 2097152 │ +│ intermediate3 │ Output │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ intermediate3-buffer-2735 │ Internal │ bfloat16 │ 2097152 │ +│ all_gather.1 │ Internal │ bfloat16 │ 2097152 │ +│ dot.4-buffer-2733 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.29 │ Internal │ bfloat16 │ 1048576 │ +│ reshape.24 │ Internal │ bfloat16 │ 1048576 │ +│ transpose.1 │ Internal │ bfloat16 │ 1048576 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 1048576 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1132 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.038 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.019 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 57 │ 48243204 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 40 │ 5767168 │ +│ Save │ Internal │ 46 │ 5242880 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 6 │ +│ 256 │ 49 │ +│ 512 │ 26 │ +│ 1024 │ 4 │ +│ 2048 │ 4 │ +│ 6144 │ 16 │ +│ 8192 │ 41 │ +│ 1048576 │ 19 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 1498 #MatMult-Transposes 116 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [build_flow_deps]: Allocs: 678 instructions: 2136 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 2097152 │ +│ add.4 │ Internal │ bfloat16 │ 2097152 │ +│ dot.7-buffer-2427 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate5 │ Output │ bfloat16 │ 2097152 │ +│ intermediate0 │ Input │ bfloat16 │ 2097152 │ +│ all_reduce.1-buffer-2429 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ dot.11-buffer-2432 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate6-buffer-2434 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate6 │ Output │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.002 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2139 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 5638 edges +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [build_flow_deps]: Done build fdeps 5638 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.008 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 57 │ 48243204 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 40 │ 5767168 │ +│ Save │ Internal │ 46 │ 5242880 │ +│ Save │ Internal -> Output │ 1 │ 1048576 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 6 │ +│ 256 │ 49 │ +│ 512 │ 26 │ +│ 1024 │ 4 │ +│ 2048 │ 4 │ +│ 6144 │ 16 │ +│ 8192 │ 41 │ +│ 1048576 │ 19 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 1498 #MatMult-Transposes 116 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 2097152 │ +│ add.4 │ Internal │ bfloat16 │ 2097152 │ +│ dot.7-buffer-2427 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate5 │ Output │ bfloat16 │ 2097152 │ +│ intermediate0 │ Input │ bfloat16 │ 2097152 │ +│ all_reduce.1-buffer-2429 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ dot.11-buffer-2432 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate6-buffer-2434 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate6 │ Output │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 355mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2136 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 710 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 22 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 879 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 37 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 24 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 391 PSUM Banks +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 30 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.549 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 357mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.104 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.674 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 359mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.048 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [build_flow_deps]: Allocs: 2451 instructions: 12962 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 33813 edges +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [build_flow_deps]: Done build fdeps 33813 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.154 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.197 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 344 │ 193343500 │ +│ Load │ Internal │ 11 │ 3148934 │ +│ Save │ Internal │ 303 │ 3298816 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 1 │ +│ 2048 │ 2 │ +│ 4096 │ 297 │ +│ 6144 │ 16 │ +│ 8192 │ 30 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 10890 #MatMult-Transposes 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348923920 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 2048 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ convert.53 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ all_reduce.3-buffer-2056 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate84 │ Input │ bfloat16 │ 2097152 │ +│ intermediate83 │ Input │ bfloat16 │ 2097152 │ +│ dot.14-buffer-2054 │ Internal │ bfloat16 │ 2097152 │ +│ add.9 │ Internal │ bfloat16 │ 2097152 │ +│ input365_local_1080_i11 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1080_i10 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1080_i9 │ Internal │ bfloat16 │ 1572864 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.015 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12962 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.032 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [build_flow_deps]: Allocs: 2837 instructions: 13680 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 43870 edges +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [build_flow_deps]: Done build fdeps 43870 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.071 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 3 │ 2097152 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 344 │ 193343500 │ +│ Load │ Internal │ 25 │ 3468170 │ +│ Save │ Internal │ 320 │ 3314183 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 17 │ +│ 2048 │ 3 │ +│ 4096 │ 297 │ +│ 6144 │ 16 │ +│ 8192 │ 30 │ +│ 9496 │ 2 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 11014 #MatMult-Transposes 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348923920 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 2048 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ add.9 │ Internal │ bfloat16 │ 2097152 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ convert.53 │ Internal │ bfloat16 │ 2097152 │ +│ intermediate84 │ Input │ bfloat16 │ 2097152 │ +│ intermediate83 │ Input │ bfloat16 │ 2097152 │ +│ dot.14-buffer-2054 │ Internal │ bfloat16 │ 2097152 │ +│ all_reduce.3-buffer-2056 │ Internal │ bfloat16 │ 2097152 │ +│ input365_local_1080_i3 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1080_i2 │ Internal │ bfloat16 │ 1572864 │ +│ input365_local_1080_i1 │ Internal │ bfloat16 │ 1572864 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.005 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13680 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.984 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=7912 blocks=6 instructions=33184 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 78 DMA instructions. Moved 33 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 77 DMA instructions. Moved 32 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 48 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 47 DMA instructions. Moved 1 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 324 DMA instructions. Moved 4 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 305 DMA instructions. Moved 2 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9058 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: assign_trigger_engine finished after 0.016 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Output has 6 module(s), 6 function(s), 7912 memory location(s), 6 block(s), and 33184 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33184 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2451 blocks=1 instructions=12962 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2837 blocks=1 instructions=13680 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=679 blocks=1 instructions=2139 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=634 blocks=1 instructions=1135 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=678 blocks=1 instructions=2136 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=633 blocks=1 instructions=1132 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=7912 blocks=6 instructions=33198 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: assign_hwdge_engine finished after 0.004 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Output has 6 module(s), 6 function(s), 7912 memory location(s), 6 block(s), and 33198 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33198 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2837 blocks=1 instructions=13683 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2451 blocks=1 instructions=12965 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 7 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 2 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 4 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 340 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 3 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 9 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 24 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 9 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 9 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 340 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 3 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2451 blocks=1 instructions=12965 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2837 blocks=1 instructions=13683 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=633 blocks=1 instructions=1134 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 16 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 73 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 38 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 45 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=633 blocks=1 instructions=1134 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=679 blocks=1 instructions=2141 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=634 blocks=1 instructions=1137 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=678 blocks=1 instructions=2138 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 16 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 73 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 39 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 45 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 16 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 45 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 76 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 46 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=634 blocks=1 instructions=1137 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=679 blocks=1 instructions=2141 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 16 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 45 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 75 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 46 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=678 blocks=1 instructions=2138 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.002 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.011 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33198 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:54Z USER 9058 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:54Z INFO 9058 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=3762 blocks=3 instructions=16237 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=4150 blocks=3 instructions=16961 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 3762 memory location(s), 3 block(s), and 16237 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 4150 memory location(s), 3 block(s), and 16961 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: nc_parallel_pass finished after 0.011 seconds +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33198 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=634 blocks=1 instructions=1137 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=679 blocks=1 instructions=2141 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=678 blocks=1 instructions=2138 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=678 blocks=1 instructions=2138 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=634 blocks=1 instructions=1137 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=679 blocks=1 instructions=2141 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2451 blocks=1 instructions=12965 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2451 blocks=1 instructions=12965 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=634 blocks=1 instructions=1137 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.003 seconds +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2837 blocks=1 instructions=13683 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 962 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=633 blocks=1 instructions=1134 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2837 blocks=1 instructions=13683 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=633 blocks=1 instructions=1134 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=679 blocks=1 instructions=2141 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=633 blocks=1 instructions=1134 Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 360mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=678 blocks=1 instructions=2138 Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 1049 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 1049 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 960 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 1047 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 1047 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 2523 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Finished dependency reduction: 4528 removed, new total 644 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9058 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.019 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 361mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 634 memory location(s), 1 block(s), and 1137 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 2522 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 2659 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 2659 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 2655 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 2655 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Finished dependency reduction: 4524 removed, new total 643 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9058 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.023 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 362mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 633 memory location(s), 1 block(s), and 1134 instruction(s). Max writers: 18 Max Readers: 104 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.034 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 362mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z USER 9058 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2451 blocks=1 instructions=12965 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Finished dependency reduction: 10941 removed, new total 767 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9058 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.041 seconds +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.043 seconds +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z USER 9058 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 679 memory location(s), 1 block(s), and 2141 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2837 blocks=1 instructions=13683 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Finished dependency reduction: 10935 removed, new total 765 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9058 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.045 seconds +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 372mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9058 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 678 memory location(s), 1 block(s), and 2138 instruction(s). Max writers: 24 Max Readers: 385 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 11962 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 12616 +2025-11-04T21:38:54Z INFO 9058 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 12616 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 12479 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 13493 +2025-11-04T21:38:54Z INFO 9058 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 13493 +2025-11-04T21:38:55Z INFO 9058 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sg02) [DepReduction]: Finished dependency reduction: 57479 removed, new total 3160 +2025-11-04T21:38:55Z INFO 9058 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:55Z USER 9058 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.139 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 374mb (delta=2mb) +2025-11-04T21:38:55Z INFO 9058 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2451 memory location(s), 1 block(s), and 12965 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sg02) [DepReduction]: Finished dependency reduction: 74197 removed, new total 3975 +2025-11-04T21:38:55Z INFO 9058 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:55Z USER 9058 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.172 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 374mb (delta=2mb) +2025-11-04T21:38:55Z INFO 9058 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2837 memory location(s), 1 block(s), and 13683 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.231 seconds +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: curr_vmrss: 371mb, ru_maxrss: 374mb (delta=2mb) +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=7912 blocks=6 instructions=33198 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=4150 blocks=3 instructions=16961 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=3762 blocks=3 instructions=16237 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 51849 #MatMult-Transposes 8383 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 108 #out: 0 #inp: 108 #symmetric: 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 51973 #MatMult-Transposes 8383 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 108 #out: 0 #inp: 108 #symmetric: 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: bir_linker finished after 0.451 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 544mb, ru_maxrss: 544mb (delta=170mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 280411034, 89.988% input load, 0.794628% output write, 9.21733% spill/reload +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 4 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 43264 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10493444 │ +│ Load │ Internal │ 69 │ 4718592 │ +│ Save │ Internal │ 45 │ 3670016 │ +│ Save │ Internal -> Output │ 3 │ 1179648 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 16 │ 1 │ +│ 32 │ 3 │ +│ 256 │ 50 │ +│ 512 │ 59 │ +│ 1024 │ 5 │ +│ 2048 │ 8 │ +│ 4096 │ 1 │ +│ 8192 │ 12 │ +│ 1048576 │ 16 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 57 │ 48243204 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 40 │ 5767168 │ +│ Save │ Internal │ 46 │ 5242880 │ +│ Save │ Internal -> Output │ 1 │ 1048576 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 6 │ +│ 256 │ 49 │ +│ 512 │ 26 │ +│ 1024 │ 4 │ +│ 2048 │ 4 │ +│ 6144 │ 16 │ +│ 8192 │ 41 │ +│ 1048576 │ 19 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 1 │ 2097152 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 344 │ 193343500 │ +│ Load │ Internal │ 11 │ 3148934 │ +│ Save │ Internal │ 303 │ 3298816 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 1 │ +│ 2048 │ 2 │ +│ 4096 │ 297 │ +│ 6144 │ 16 │ +│ 8192 │ 30 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 12901 #MatMult-Transposes 5367 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: bir_linker finished after 0.504 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=170mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781420588 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 2097152 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: report_stats finished after 0.017 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:55Z INFO 9058 []: find first defs for local +2025-11-04T21:38:55Z INFO 9058 []: find first defs for global +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 281061813, 89.8922% input load, 0.792791% output write, 9.31504% spill/reload +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 4 │ 1244659712 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 43264 │ +│ Load │ ExternalInput -> Internal │ 18 │ 10493444 │ +│ Load │ Internal │ 69 │ 4718592 │ +│ Save │ Internal │ 45 │ 3670016 │ +│ Save │ Internal -> Output │ 4 │ 1179650 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 16 │ 1 │ +│ 32 │ 3 │ +│ 256 │ 50 │ +│ 512 │ 59 │ +│ 1024 │ 5 │ +│ 2048 │ 8 │ +│ 4096 │ 1 │ +│ 8192 │ 12 │ +│ 1048576 │ 16 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal -> ExternalOutput │ 16 │ 536870912 │ +│ DMACopy │ Internal -> Output │ 1 │ 4194304 │ +│ DMACopy (Spill) │ Internal │ 16 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 49152 │ +│ Load │ ExternalInput -> Internal │ 57 │ 48243204 │ +│ Load │ Input -> Internal │ 2 │ 131072 │ +│ Load │ Internal │ 40 │ 5767168 │ +│ Save │ Internal │ 46 │ 5242880 │ +│ Save │ Internal -> Output │ 2 │ 1048578 │ +└─────────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 6 │ +│ 256 │ 49 │ +│ 512 │ 26 │ +│ 1024 │ 4 │ +│ 2048 │ 4 │ +│ 6144 │ 16 │ +│ 8192 │ 41 │ +│ 1048576 │ 19 │ +│ 2097152 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 3145728 │ +│ DMACopy │ Internal │ 3 │ 2097152 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 344 │ 193343500 │ +│ Load │ Internal │ 25 │ 3468170 │ +│ Save │ Internal │ 320 │ 3314183 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 17 │ +│ 2048 │ 3 │ +│ 4096 │ 297 │ +│ 6144 │ 16 │ +│ 8192 │ 30 │ +│ 9496 │ 2 │ +│ 1048576 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 13025 #MatMult-Transposes 5367 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781420588 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 2097152 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 2097152 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: report_stats finished after 0.024 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:55Z INFO 9058 []: find first defs for local +2025-11-04T21:38:55Z INFO 9058 []: find first defs for global +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.062 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: spill space = 117702712 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 117817344 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.050 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: spill space = 117702712 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 117817344 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: size = 86 +2025-11-04T21:38:55Z INFO 9058 []: find first defs for local +2025-11-04T21:38:55Z INFO 9058 []: find first defs for global +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 86 Num locations 86 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: lo = 86 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: total = 86 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 14680064 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 23343104 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.059 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.067 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: nc_parallel_pass finished after 0.683 seconds +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=170mb) +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=8940 blocks=8 instructions=33282 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:55Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=8940 blocks=8 instructions=33282 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.004 seconds +2025-11-04T21:38:55Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 33282 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.013 seconds +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z USER 9058 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:55Z INFO 9058 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=8940 blocks=8 instructions=33282 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.028 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.045 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.011 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.020 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17003 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=4664 blocks=4 instructions=17003 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17010 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=4664 blocks=4 instructions=17010 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.025 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16279 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=4276 blocks=4 instructions=16279 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 397mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16286 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:55Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=4276 blocks=4 instructions=16286 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z INFO 9058 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1780/1780 (100% DGE) + power-of-2 partition : 1808/1871 (96.6328% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1809/1872 (96.6346% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 2587/2587 (100% DGE) + power-of-2 partition : 2587/2929 (88.3237% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2587/2929 (88.3237% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 28 + Transpose : 448 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 452/452 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: lower_dma finished after 0.070 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17010 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=4664 blocks=4 instructions=17010 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: expand_all_engine finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17010 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:55Z USER 9058 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:55Z INFO 9058 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=4664 blocks=4 instructions=17010 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1780/1780 (100% DGE) + power-of-2 partition : 1780/1814 (98.1257% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1781/1815 (98.1267% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 2582/2582 (100% DGE) + power-of-2 partition : 2582/2891 (89.3117% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2582/2891 (89.3117% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 28 + Transpose : 448 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 452/452 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: lower_dma finished after 0.094 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16286 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=4276 blocks=4 instructions=16286 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.049 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17010 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=4664 blocks=4 instructions=17010 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: expand_all_engine finished after 0.011 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16286 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=4276 blocks=4 instructions=16286 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: expand_inst_late finished after 0.031 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17089 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=4664 blocks=4 instructions=17089 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [SeqInstOpt]: Removing 36 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [SeqInstOpt]: Removing 33 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 17020 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=4664 blocks=4 instructions=17020 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: lower_sync finished after 0.011 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18176 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=4664 blocks=4 instructions=18176 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: lower_act finished after 0.004 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=4664 blocks=4 instructions=18190 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.051 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 398mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16286 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=4276 blocks=4 instructions=16286 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: expand_inst_late finished after 0.038 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 401mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16365 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=4276 blocks=4 instructions=16365 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [SeqInstOpt]: Removing 36 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [SeqInstOpt]: Removing 33 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 401mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 16296 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=4276 blocks=4 instructions=16296 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: lower_sync finished after 0.010 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 401mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17296 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=4276 blocks=4 instructions=17296 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: lower_act finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 401mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=4276 blocks=4 instructions=17309 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: lower_dve finished after 0.070 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 401mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=4664 blocks=4 instructions=18190 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: lower_ap finished after 0.005 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 402mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=4664 blocks=4 instructions=18190 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z USER 9058 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.099 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: curr_vmrss: 404mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: lower_dve finished after 0.121 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 404mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=4276 blocks=4 instructions=17309 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: lower_ap finished after 0.010 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 404mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=4276 blocks=4 instructions=17309 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9058 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9058 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z USER 9058 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.114 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: nc_parallel_pass finished after 0.575 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: vnc_remote_addr_map finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 35499 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 [VncLink]: Found 0 remote updates +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 35499 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=4276 blocks=4 instructions=17309 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=4664 blocks=4 instructions=18190 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.120 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.115 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.124 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:56Z INFO 9058 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9058 (sg00) [SubgraphForkPass]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 35499 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 405mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=4664 blocks=4 instructions=18190 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000489246 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=4276 blocks=4 instructions=17309 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000487331 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 13243 │ +│ LDWEIGHTS │ 13234 │ +│ EVENT_SEMAPHORE │ 1156 │ +│ COPY │ 812 │ +│ CAST │ 700 │ +│ UNKNOWN(0xd4) │ 676 │ +│ PSEUDO_DMA_TRIGGER │ 390 │ +│ ACTIVATE │ 319 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ GATHER │ 291 │ +│ TENSOR_TENSOR │ 243 │ +│ UNKNOWN(0xd3) │ 145 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ MEMSET │ 76 │ +│ TENSOR_SCALAR_ADDR │ 75 │ +│ UNKNOWN(0xd8) │ 48 │ +│ UNKNOWN(0xda) │ 44 │ +│ TENSOR_REDUCE │ 40 │ +│ UNKNOWN(0x92) │ 40 │ +│ RECIPROCAL │ 35 │ +│ UNKNOWN(0x24) │ 32 │ +│ UNKNOWN(0x9a) │ 32 │ +│ UNKNOWN(0x9b) │ 32 │ +│ TENSOR_SCALAR │ 28 │ +│ STREAM_SHUFFLE │ 20 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ LOAD_MASK_SELECT │ 20 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 14 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ ALU_OP │ 2 │ +│ IOTA │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 1827 │ +│ Scalar │ 2713 │ +│ Tensor │ 26620 │ +│ SyncDMA │ 0 │ +│ Vector │ 1259 │ +│ Sync │ 226 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.216 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 142 │ +│ qPoolSpillReload0_defId_0 │ 16384 │ +│ qPoolSpillReload0_defId_1 │ 16384 │ +│ qPoolSpillReload0_defId_2 │ 204 │ +│ qSPIO0 │ 21582 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 358 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 55714 (0.000830203 GB) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2446-0_grp_2_sec_0_mhlo_exponential_6_b0_i0_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2446-0_b1_grp_1_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2747-0_grp_0_sec_0_mhlo_exponential_6_b1_i0_sg0000 │ Internal │ bfloat16 │ 4 │ +│ I-2747-0_b1_grp_0_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 4 │ +│ I-2446-0_b3_grp_0_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 4 │ +│ compare.2.1768_sg0001 │ Internal │ int32 │ 27 │ +│ all-reduce.465.2447_sg0001 │ Internal │ bfloat16 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 298 │ +└──────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.015 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 12995 │ +│ LDWEIGHTS │ 12987 │ +│ EVENT_SEMAPHORE │ 1000 │ +│ CAST │ 700 │ +│ COPY │ 684 │ +│ UNKNOWN(0xd4) │ 669 │ +│ PSEUDO_DMA_TRIGGER │ 352 │ +│ ACTIVATE │ 312 │ +│ TENSOR_TENSOR │ 241 │ +│ UNKNOWN(0xd3) │ 145 │ +│ TENSOR_SCALAR_ADDR │ 75 │ +│ MEMSET │ 62 │ +│ UNKNOWN(0xd8) │ 48 │ +│ UNKNOWN(0xda) │ 44 │ +│ UNKNOWN(0x92) │ 40 │ +│ TENSOR_REDUCE │ 35 │ +│ RECIPROCAL │ 33 │ +│ UNKNOWN(0x9a) │ 32 │ +│ UNKNOWN(0x9b) │ 32 │ +│ UNKNOWN(0x24) │ 32 │ +│ TENSOR_SCALAR │ 26 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ STREAM_SHUFFLE │ 16 │ +│ LOAD_MASK_SELECT │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 13 │ +│ MOVE │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ IOTA │ 2 │ +│ ALU_OP │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 1182 │ +│ Scalar │ 2563 │ +│ Tensor │ 26126 │ +│ SyncDMA │ 0 │ +│ Vector │ 619 │ +│ Sync │ 192 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.245 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 16384 │ +│ qPoolSpillReload0_defId_1 │ 16384 │ +│ qPoolSpillReload0_defId_2 │ 4 │ +│ qSPIO0 │ 21580 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 14 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 54966 (0.000819057 GB) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2446-0_grp_1_sec_0_mhlo_exponential_6_b2_i0_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2446-0_grp_2_sec_0_mhlo_exponential_6_b1_i0_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2446-0_b1_grp_2_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2747-0_b1_grp_0_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 4 │ +│ I-2446-0_grp_3_sec_0_mhlo_exponential_6_b3_i0_sg0001 │ Internal │ bfloat16 │ 4 │ +│ I-2747-0_grp_0_sec_0_mhlo_exponential_6_b1_i0_sg0000 │ Internal │ bfloat16 │ 4 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1768_sg0001 │ Internal │ int32 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 297 │ +└──────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.012 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.040 seconds +2025-11-04T21:38:56Z USER 9058 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.281 seconds +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4664 memory location(s), 4 block(s), and 18190 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.032 seconds +2025-11-04T21:38:56Z USER 9058 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.300 seconds +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 4276 memory location(s), 4 block(s), and 17309 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: mod_parallel_pass finished after 0.310 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 406mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.219KB │ 21.312KB │ +│ CCE │ 336.000KB │ 0.000B │ +│ Transpose │ 0.000B │ 512.000KB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.000KB │ 101.000KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.667GB │ +│ Model Code │ 1.992MB │ +│ Model Constants │ 513.012KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 22.258MB │ +│ DMA Ring IO │ 353.219KB │ +│ DMA Ring Spill │ 634.312KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.188KB │ 9.656KB │ +│ CCE │ 336.000KB │ 0.000B │ +│ Transpose │ 0.000B │ 512.000KB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 15.750KB │ 85.250KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:56Z INFO 9058 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.659GB │ +│ Model Code │ 1.873MB │ +│ Model Constants │ 511.004KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 14.000MB │ +│ DMA Ring IO │ 352.938KB │ +│ DMA Ring Spill │ 606.906KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9058 [HBMUsage]: Total estimated HBM usage is: 3.684GB +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: hbm_usage finished after 0.006 seconds +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: curr_vmrss: 406mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 35499 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z USER 9058 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:56Z INFO 9058 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=8940 blocks=8 instructions=35499 Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1707_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1626-1709_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1637-1711_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2012_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1999_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1557-1651_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1568-1653_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1579-1655_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1589-1657_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1794_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1133-1341_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1565_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1707_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1626-1709_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1637-1711_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2012_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1999_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1557-1651_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1568-1653_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1579-1655_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1589-1657_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1794_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1565_CRSM.npy +2025-11-04T21:38:56Z INFO 9058 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:56Z WARNING 9058 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/metrics.json +2025-11-04T21:38:57Z WARNING 9058 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:57Z INFO 9058 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff +2025-11-04T21:38:57Z INFO 9058 [NeffFileWriter]: IR signature: 91d9803af3451cd6c88be76ef0722a10 for neff artifacts +2025-11-04T21:38:57Z USER 9058 [BackendPassManager]: neff_packager finished after 0.154 seconds +2025-11-04T21:38:57Z INFO 9058 [BackendPassManager]: curr_vmrss: 406mb, ru_maxrss: 544mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9058 [BackendPassManager]: Output has 2 module(s), 8 function(s), 8940 memory location(s), 8 block(s), and 35499 instruction(s). Max writers: 299 Max Readers: 5146 +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.010742 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.010742 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.013672 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.000977 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.013672 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.007145 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.001003 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.008152 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.013672 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.021740 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.109726 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.000977 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.000977 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.000977 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000977 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.021740 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ all_gather.1 │ bfloat16 │ 1 │ 2.000000 MB │ +│ reshape.16 │ bfloat16 │ 1 │ 1.000000 MB │ +│ reshape.24 │ bfloat16 │ 1 │ 1.000000 MB │ +│ reshape.29 │ bfloat16 │ 1 │ 1.000000 MB │ +│ transpose.1 │ bfloat16 │ 1 │ 1.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=local (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1782 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.125000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.125000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg02, addr_space=local (complete data located at nc01/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1508 │ bfloat16 │ 1 │ 0.000008 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 2.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.125000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.125000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9058 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:57Z INFO 8593 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:57Z INFO 8593 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:57Z INFO 8593 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye +2025-11-04T21:38:57Z INFO 8593 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:57Z INFO 8593 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk2/neuronxcc-e1w4faye/hlo_netlist.json +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:57Z INFO 8593 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:57Z INFO 8593 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:57Z INFO 8563 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk2/metaneff.pb b/context_encoding_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..a9a6fd60297e2ac568d37b0f1ecb5bee94e91d54 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e16c9f7e6763d8d2b02577a4b90bcb120069c7fe5bb1001520c159d08abf614c +size 2610412 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb b/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..ec849d2d902957fe7920a6dc5178051dacd6a33f --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c22ff4f27dafd3772342a93352c9b5a2c076d1824cec83419ac3d1f8c07d4e2f +size 2697198 diff --git a/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff b/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff new file mode 100644 index 0000000000000000000000000000000000000000..41be7b180370f733e71690de9d7626ef8e0b9564 --- /dev/null +++ b/context_encoding_model/_tp0_bk2/model.MODULE_49bb42f69f5b159ae769+3467f95e.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06ba2911f0e007b1f4ad7d888115d6589d3bf2b988bbc6b3bc84a1db0766bb48 +size 1342464 diff --git a/context_encoding_model/_tp0_bk2/neuron_config.json b/context_encoding_model/_tp0_bk2/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1d4b91599bc5f0884f9fb3c01a0e8e4e9b69754f --- /dev/null +++ b/context_encoding_model/_tp0_bk2/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 512 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 512 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk3/command.txt b/context_encoding_model/_tp0_bk3/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..032b8f2aadc0ebf0f153fb1a67a3c61ea68447fb --- /dev/null +++ b/context_encoding_model/_tp0_bk3/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --output model.MODULE_be035899334776123ed5+d208bdce.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json new file mode 100644 index 0000000000000000000000000000000000000000..d657ce3e156c991c3aa29ce623fc6cf5d9db87dc --- /dev/null +++ b/context_encoding_model/_tp0_bk3/compile_flags.MODULE_be035899334776123ed5+d208bdce.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/global_metric_store.json b/context_encoding_model/_tp0_bk3/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..1104ee08af0dfffd06202981762b4dbce9137337 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/global_metric_store.json @@ -0,0 +1,1177 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.80319213867188, + "StaticProfiler::AveragePartitionUtilization": 94.51075744628906, + "StaticProfiler::AveragePeUtilization": 96.83863067626953, + "StaticProfiler::LocalizationEfficiency": 84.98564147949219, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.057534217834472656, + "AffinePredicateResolution": 0.0009605884552001953, + "AliasDependencyElimination": 0.00025153160095214844, + "AliasDependencyInduction": 0.006276607513427734, + "AliasDependencyReset": 0.027743816375732422, + "BFComputeCutting": 0.0031321048736572266, + "BirCodeGenLoop": 0.5169932842254639, + "CCOpFusion": 0.05496716499328613, + "CanonicalizeConv": 1.8000000636675395e-05, + "CanonicalizeDAGForPGTiling": 0.010706663131713867, + "CanonicalizeForTensorizer": 3.7000001611886546e-05, + "CanonicalizeIR": 0.00154876708984375, + "Canonicalizer": 0.0007949999999254942, + "CoalesceCCOp": 0.0278623104095459, + "CommuteConcat": 0.001708984375, + "DMALocalityOpt": 0.010039329528808594, + "DMAProfiler": 0.031324148178100586, + "DMATilingProfiler": 0.011522531509399414, + "DataLocalityOpt": 0.28015780448913574, + "DataStreaming": 0.031224727630615234, + "DeConcat": 0.002462148666381836, + "DeadCodeElimination": 0.0021996498107910156, + "DeadStoreElimination": 0.007483243942260742, + "DelinearIndices": 0.008810281753540039, + "Delinearization": 0.009731292724609375, + "DelinearizeSPMD": 0.04425859451293945, + "DoNothing": 0.006867170333862305, + "DramToDramTranspose": 0.012907743453979492, + "DumpGraphAndMetadata": 0.07597684860229492, + "EliminateDivs": 0.0021903514862060547, + "ExpandBatchNorm": 0.001527547836303711, + "ExpandISAMacro": 0.024112701416015625, + "FactorizeBlkDims": 0.05227327346801758, + "FactorizeThreadAxesInFreeDims": 0.003031015396118164, + "FlattenMacroLoop": 0.004990577697753906, + "GenericAccessSimplifier": 0.0007598400115966797, + "HoistCompute": 1.2000000424450263e-05, + "IdentifyCrossPassTensors": 5.0000002374872565e-05, + "InferInitValue": 0.10130023956298828, + "InferIntrinsicOnCC": 0.007919549942016602, + "InferNeuronTensor": 0.05837249755859375, + "InferNonlocalTensors": 0.05706453323364258, + "InferPSumTensor": 0.06946349143981934, + "InferShardAxis": 0.4604020118713379, + "InferSharedMemLoc": 0.05161857604980469, + "InlineNativeKernels": 0.006569623947143555, + "InsertCoreBarrier": 0.018887758255004883, + "InsertIOTransposes": 0.0684211254119873, + "InsertImplicitShardAxisBeforeISel": 0.01549673080444336, + "InsertLocalTransposes": 0.022176742553710938, + "InsertOffloadedTransposes": 0.0181121826171875, + "LICM": 0.007555484771728516, + "LateLegalizeInst": 0.0287015438079834, + "LateLegalizePostSplit": 0.01993083953857422, + "LateLowerReshapeOp": 0.0016782283782958984, + "LateLowerTensorOp": 0.0021178722381591797, + "LateNeuronInstComb": 0.05098986625671387, + "LayoutPreprocessing": 0.10170960426330566, + "LayoutPreprocessingAndAnalysis": 0.23344039916992188, + "LayoutRequirementAnalysis": 0.032952308654785156, + "LegalizeCCOpLayout": 0.002583742141723633, + "LegalizeOpLevelAlias": 0.002170562744140625, + "LegalizePartitionReduce": 0.0025551319122314453, + "LegalizeSundaAccess": 0.1115577220916748, + "LegalizeSundaMacro": 0.04086017608642578, + "LegalizeType": 0.033699750900268555, + "LocalLayoutOpt": 0.023218154907226563, + "LoopFusion": 0.005990266799926758, + "LoopSplitting": 0.0007989406585693359, + "LowerBroadcast": 0.011745214462280273, + "LowerCCOpBlockAxis": 0.007201671600341797, + "LowerComplexBroadcast": 0.00890207290649414, + "LowerIntrinsics": 0.10557985305786133, + "LowerShardAxis": 0.023633956909179688, + "LowerTensorOp": 0.03027796745300293, + "LowerToSendRecv": 0.027859210968017578, + "LowerTranspose": 0.028818368911743164, + "MacroGeneration": 0.12761783599853516, + "MaskPropagation": 0.01400303840637207, + "MemcastMotion": 2.7999998565064743e-05, + "MemcpyElimination": 0.03596854209899902, + "MutateDataType": 0.0020971298217773438, + "NeuronAliasDependencyInduction": 0.0019202232360839844, + "NeuronAliasDependencyReset": 0.027405738830566406, + "NeuronInstComb": 0.048494815826416016, + "NeuronLICM": 0.052613019943237305, + "NeuronLoopFusion": 0.06255030632019043, + "NeuronLoopInterchange": 0.002681255340576172, + "NeuronSimplifier": 0.01907205581665039, + "NeuronSimplifyPredicates": 0.04273796081542969, + "NeuronValueNumbering": 0.019763708114624023, + "OptimizeAliasedCopyChain": 0.0005273818969726563, + "OptimizeNKIKernels": 4.391921043395996, + "PAGLayoutOpt": 0.16190624237060547, + "PComputeCutting": 0.016373872756958008, + "PGLayoutTilingPipeline": 2.0541465282440186, + "PGTiling": 0.3632845878601074, + "PadElimination": 0.0006501674652099609, + "ParAxesAnnotation": 0.08851456642150879, + "PartialLoopFusion": 0.05034661293029785, + "PartialSimdFusion": 0.014182329177856445, + "PenguinizeFunctions": 3.899999865097925e-05, + "PerfectLoopNest": 0.0036270618438720703, + "PruneFunctions": 3.7999998312443495e-05, + "RecognizeOpIdiom": 0.007064342498779297, + "Recompute": 0.00046062469482421875, + "RelaxPredicates": 0.02269601821899414, + "Rematerialization": 0.0019779205322265625, + "RemoveOptimizationBarriers": 4.400000034365803e-05, + "RemoveShardedPartitionAxes": 0.014830350875854492, + "ReshapeWeights": 0.0021474361419677734, + "ResolveAccessConflict": 0.007428646087646484, + "ResolveComplicatePredicates": 0.001834869384765625, + "RewriteReplicationMatmul": 0.006201982498168945, + "RewriteWeights": 0.004793643951416016, + "SFKVectorizer": 0.41699957847595215, + "ScatterMotion": 3.80000019504223e-05, + "ShardingPropagationAnalysis": 0.2801475524902344, + "SimpleAllReduceTiling": 0.025059938430786133, + "Simplifier": 0.003251314163208008, + "SimplifyMacroPredicates": 0.03280019760131836, + "SimplifyNeuronTensor": 0.14811110496520996, + "SimplifySlice": 0.0008628368377685547, + "SimplifyTensor": 0.014911413192749023, + "SpillPSum": 0.0687708854675293, + "SplitAPUnionSets": 0.09714126586914063, + "SplitAccGrp": 0.006166219711303711, + "StaticProfiler": 0.021403789520263672, + "StaticTransposeLocalTensor": 0.02319931983947754, + "SundaISel": 0.07143282890319824, + "TCTransform": 0.001344442367553711, + "TensorInitialization": 0.020877599716186523, + "TensorOpSimplifier": 0.0060787200927734375, + "TensorOpTransform": 0.03784608840942383, + "TensorizerLegalizationPass": 5.0000002374872565e-05, + "TileCCOps": 0.005100250244140625, + "TilingProfiler": 0.02941441535949707, + "TransformConvOp": 0.005896091461181641, + "TritiumFusion": 0.08978962898254395, + "ValueNumbering": 0.0032432079315185547, + "VectorizeDMA": 0.005987644195556641, + "VectorizeMatMult": 0.019278526306152344, + "VerifySupportedOps": 3.600000127335079e-05, + "WeightCoalescing": 0.014359712600708008, + "ZeroSizeTensorElimination": 0.00021028518676757813, + "algsimp": 0.001816999982111156, + "batchnorm_expander": 3.5000000934815034e-05, + "boundary-marker-removal": 1.2999998943996616e-05, + "call-inliner": 0.00031099998159334064, + "canonicalize-boundary-marker": 1.5999999959603883e-05, + "collective-stream-id-checker": 7.60000039008446e-05, + "comparison-expander": 0.0004780000017490238, + "computation-deduplicator": 5.699999746866524e-05, + "config-lowering": 0.00012000000424450263, + "constant-statistics": 0.00038899999344721437, + "constant_folding": 0.00016199999663513154, + "cse": 3.5000000934815034e-05, + "dce": 4.3000000005122274e-05, + "dot_decomposer": 0.0010089999996125698, + "dynamic-slice-transpose": 1.2000000424450263e-05, + "eliminate-redundant-compare": 0.00013299999409355223, + "emit-offloaded-dropout": 3.7000001611886546e-05, + "flatten-call-graph": 0.0008110000053420663, + "fuse-send-recv": 6.600000051548705e-05, + "hilo-conditional-to-select": 1.2999999853491317e-05, + "hilo::LegalizeAlias": 1.1000000085914508e-05, + "hilo::NeuronInstCombine": 0.00019799999427050352, + "hilo::NeuronOpFusion": 3.7000001611886546e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 4.8000001697801054e-05, + "hilo::ScheduleFusion": 3.999999989900971e-06, + "hilo::SixtyFourHack": 6.800000119255856e-05, + "hilo::VerifyAliasing": 3.999999989900971e-06, + "hlo-mac-count": 0.012529000639915466, + "instruction-histogram": 0.0008679999737069011, + "io-con-pipe-begin": 6.000000212225132e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0010789999505504966, + "io-statistics": 3.899999865097925e-05, + "legalize-ccops-for-tensorizer": 3.000000106112566e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.999999747378752e-06, + "map-inline": 0.000813000020571053, + "metadata-naming": 4.900000203633681e-05, + "mlir::detail::OpToOpPassAdaptor": 7.60000039008446e-05, + "mlir::hlo::MhloToPyPenguin": 0.008621999993920326, + "mlir::mhlo::LowerComplexExtraPass": 0.00021299999207258224, + "mlir::mhlo::LowerComplexPass": 0.0003549999964889139, + "native-to-custom-softmax": 0.00033000000985339284, + "native-to-custom-softmax-dx": 0.0016530000139027834, + "neuron-hlo-verifier": 0.011901999823749065, + "operand_upcaster": 5.299999611452222e-05, + "opt-barrier-removal": 0.0003209999995306134, + "post-par-pipe-begin": 0.0003220000071451068, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0015040000434964895, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.06566499918699265, + "replace-minimum-constant": 0.0003129999968223274, + "reshape-mover": 6.000000212225132e-05, + "simplify-concat": 0.00011900000390596688, + "simplify-while-loops": 5.900000178371556e-05, + "transform-variadic-reduce": 6.399999983841553e-05, + "tuple-simplifier": 0.00015100000018719584, + "unpack-nested-aws-ntwsr": 0.00023299999884329736, + "unroll-while-loop": 9.000000318337698e-06, + "zero_sized_hlo_elimination": 0.0007510000141337514 + }, + "hilo": { + "ConstantSize": 1843839.0, + "HloInputCount": 371.0, + "HloMacCount": 53843722240.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910920192.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 915302528.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0, + "StaticProfiler::AifUb": 229.36119079589844, + "StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406, + "StaticProfiler::AverageDmaLength": 2258.685546875, + "StaticProfiler::DDRTransferBytes": 420482080.0, + "StaticProfiler::InternalTransferBytes": 338614048.0, + "StaticProfiler::LoadExpanded": 118366.0, + "StaticProfiler::StoreExpanded": 4458.0, + "StaticProfiler::TotalDMAExpanded": 122824.0, + "StaticProfiler::TotalDynamicInstancesCount": 27423.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 11808.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 9889.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 165.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0016659999964758754, + "call-inliner": 0.0002859999949578196, + "collective-stream-id-checker": 6.600000051548705e-05, + "comparison-expander": 0.00045900000259280205, + "constant-statistics": 0.00038899999344721437, + "constant_folding": 0.00014000000373926014, + "dce": 3.9999998989515007e-05, + "dot_decomposer": 0.0010089999996125698, + "eliminate-redundant-compare": 0.00012399999832268804, + "flatten-call-graph": 0.0007849999819882214, + "hlo-mac-count": 0.007579999975860119, + "instruction-histogram": 0.0008679999737069011, + "io-con-pipe-begin": 6.000000212225132e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0010789999505504966, + "io-statistics": 3.899999865097925e-05, + "map-inline": 0.0007789999945089221, + "native-to-custom-softmax": 0.000311999989207834, + "native-to-custom-softmax-dx": 0.00039400000241585076, + "neuron-hlo-verifier": 0.01071999967098236, + "opt-barrier-removal": 0.0003209999995306134, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.06566499918699265, + "replace-minimum-constant": 0.00029399999766610563, + "reshape-mover": 5.199999941396527e-05, + "simplify-while-loops": 5.2999999752501026e-05, + "tuple-simplifier": 0.00013800000306218863, + "unpack-nested-aws-ntwsr": 0.0002209999947808683, + "unroll-while-loop": 9.000000318337698e-06, + "zero_sized_hlo_elimination": 0.0007510000141337514 + } + }, + "attention_isa_kernel": { + "compiletime": { + "CoalesceCCOp": 0.00023293495178222656, + "DMALocalityOpt": 0.0001811981201171875, + "DMAProfiler": 0.00021409988403320313, + "DataStreaming": 0.00021123886108398438, + "DoNothing": 0.00015926361083984375, + "ExpandISAMacro": 0.00025653839111328125, + "FactorizeBlkDims": 0.0004589557647705078, + "InferPSumTensor": 0.001004934310913086, + "InferSharedMemLoc": 0.0005850791931152344, + "InsertCoreBarrier": 0.00032901763916015625, + "LateLegalizeInst": 0.000202178955078125, + "LateNeuronInstComb": 0.000457763671875, + "LegalizeSundaAccess": 0.000244140625, + "LegalizeType": 0.00035119056701660156, + "LowerBroadcast": 0.0002529621124267578, + "LowerIntrinsics": 0.00025534629821777344, + "LowerTranspose": 0.00019860267639160156, + "NeuronInstComb": 0.0004410743713378906, + "NeuronLICM": 0.00022935867309570313, + "NeuronSimplifyPredicates": 0.00023698806762695313, + "NeuronValueNumbering": 0.00019621849060058594, + "SFKVectorizer": 0.0017054080963134766, + "SimpleAllReduceTiling": 0.00020575523376464844, + "SimplifyNeuronTensor": 0.00058746337890625, + "SpillPSum": 0.0008275508880615234, + "WeightCoalescing": 0.0002827644348144531 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0004239082336425781, + "DMALocalityOpt": 0.0008606910705566406, + "DMAProfiler": 0.0012273788452148438, + "DataStreaming": 0.0004677772521972656, + "DoNothing": 0.0020771026611328125, + "ExpandISAMacro": 0.0009121894836425781, + "FactorizeBlkDims": 0.0007412433624267578, + "InferPSumTensor": 0.0011811256408691406, + "InferSharedMemLoc": 0.00045990943908691406, + "InsertCoreBarrier": 0.00042891502380371094, + "LateLegalizeInst": 0.00063323974609375, + "LateNeuronInstComb": 0.0013093948364257813, + "LegalizeSundaAccess": 0.0025353431701660156, + "LegalizeType": 0.001573801040649414, + "LowerBroadcast": 0.0004336833953857422, + "LowerIntrinsics": 0.0003495216369628906, + "LowerTranspose": 0.00044226646423339844, + "NeuronInstComb": 0.007911205291748047, + "NeuronLICM": 0.0006246566772460938, + "NeuronSimplifyPredicates": 0.006840705871582031, + "NeuronValueNumbering": 0.0007255077362060547, + "SFKVectorizer": 0.008939266204833984, + "SimpleAllReduceTiling": 0.0003476142883300781, + "SimplifyNeuronTensor": 0.0009677410125732422, + "SpillPSum": 0.0031452178955078125, + "WeightCoalescing": 0.000408172607421875 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.4000000192027073e-05, + "Canonicalizer": 0.0002680000034160912, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.4000000192027073e-05, + "MemcastMotion": 9.000000318337698e-06, + "PenguinizeFunctions": 1.4999999621068127e-05, + "PruneFunctions": 1.2999999853491317e-05, + "RemoveOptimizationBarriers": 7.000000096013537e-06, + "ScatterMotion": 1.8000000636675395e-05, + "TensorizerLegalizationPass": 2.700000004551839e-05, + "VerifySupportedOps": 1.4000000192027073e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.2999999853491317e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.000000096013537e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 7.000000096013537e-06, + "computation-deduplicator": 1.700000029813964e-05, + "config-lowering": 3.899999865097925e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 1.1000000085914508e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2000000424450263e-05, + "flatten-call-graph": 7.999999979801942e-06, + "fuse-send-recv": 2.8000000384054147e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 7.79999973019585e-05, + "hilo::NeuronOpFusion": 1.4000000192027073e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.8999999156221747e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.1000000085914508e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 9.999999747378752e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.2000000424450263e-05, + "metadata-naming": 1.4000000192027073e-05, + "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05, + "mlir::hlo::MhloToPyPenguin": 0.001617999980226159, + "mlir::mhlo::LowerComplexExtraPass": 6.70000008540228e-05, + "mlir::mhlo::LowerComplexPass": 0.00011800000356743112, + "native-to-custom-softmax": 7.000000096013537e-06, + "native-to-custom-softmax-dx": 0.001218999968841672, + "neuron-hlo-verifier": 0.0004619999963324517, + "operand_upcaster": 2.099999983329326e-05, + "post-par-pipe-begin": 0.00031800000579096377, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00047400000039488077, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.7000001611886546e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.999999979801942e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 36.6374397277832, + "ConstantSize": 1843839.0, + "HloInputCount": 371.0, + "HloMacCount": 7516192768.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910920192.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 410301216.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.10170578956604004, + "AffinePredicateResolution": 0.002114534378051758, + "AliasDependencyElimination": 0.0003135204315185547, + "AliasDependencyInduction": 0.008873462677001953, + "AliasDependencyReset": 0.08848953247070313, + "BFComputeCutting": 0.0046901702880859375, + "BirCodeGenLoop": 0.07164216041564941, + "CCOpFusion": 0.03796195983886719, + "CanonicalizeDAGForPGTiling": 0.004980564117431641, + "CanonicalizeIR": 0.0069043636322021484, + "CoalesceCCOp": 0.025182723999023438, + "CommuteConcat": 0.0019867420196533203, + "DMALocalityOpt": 0.0017561912536621094, + "DMAProfiler": 0.015140295028686523, + "DMATilingProfiler": 0.016626596450805664, + "DataLocalityOpt": 0.22760343551635742, + "DataStreaming": 0.010300159454345703, + "DeConcat": 0.0027208328247070313, + "DeadCodeElimination": 0.0024912357330322266, + "DeadStoreElimination": 0.0712437629699707, + "DelinearIndices": 0.016620635986328125, + "Delinearization": 0.009757280349731445, + "DelinearizeSPMD": 0.031106233596801758, + "DoNothing": 0.00010442733764648438, + "DramToDramTranspose": 0.015790462493896484, + "DumpGraphAndMetadata": 0.009348392486572266, + "EliminateDivs": 0.0055081844329833984, + "ExpandBatchNorm": 0.002715587615966797, + "ExpandISAMacro": 0.006904125213623047, + "FactorizeBlkDims": 0.02294635772705078, + "FactorizeThreadAxesInFreeDims": 0.004876136779785156, + "FlattenMacroLoop": 0.014545440673828125, + "GenericAccessSimplifier": 0.0014882087707519531, + "InferInitValue": 0.07265543937683105, + "InferIntrinsicOnCC": 0.016221046447753906, + "InferNeuronTensor": 0.06634330749511719, + "InferNonlocalTensors": 0.310718297958374, + "InferPSumTensor": 0.1104276180267334, + "InferShardAxis": 0.6379494667053223, + "InferSharedMemLoc": 0.007468461990356445, + "InlineNativeKernels": 0.008686304092407227, + "InsertCoreBarrier": 0.013060331344604492, + "InsertIOTransposes": 0.0500941276550293, + "InsertImplicitShardAxisBeforeISel": 0.013952255249023438, + "InsertLocalTransposes": 0.011726140975952148, + "InsertOffloadedTransposes": 0.015027046203613281, + "LICM": 0.009333610534667969, + "LateLegalizeInst": 0.02084517478942871, + "LateLegalizePostSplit": 0.006055116653442383, + "LateLowerReshapeOp": 0.0010623931884765625, + "LateLowerTensorOp": 0.005917787551879883, + "LateNeuronInstComb": 0.0374608039855957, + "LayoutPreprocessing": 0.11253118515014648, + "LayoutPreprocessingAndAnalysis": 0.17174959182739258, + "LayoutRequirementAnalysis": 0.01859116554260254, + "LegalizeCCOpLayout": 0.008987903594970703, + "LegalizeOpLevelAlias": 0.0018634796142578125, + "LegalizePartitionReduce": 0.0028128623962402344, + "LegalizeSundaAccess": 0.0760490894317627, + "LegalizeSundaMacro": 0.04249215126037598, + "LegalizeType": 0.017363786697387695, + "LocalLayoutOpt": 0.030303478240966797, + "LoopFusion": 0.015121221542358398, + "LoopSplitting": 0.001684427261352539, + "LowerBroadcast": 0.004286289215087891, + "LowerCCOpBlockAxis": 0.011670112609863281, + "LowerComplexBroadcast": 0.009485006332397461, + "LowerIntrinsics": 0.06814241409301758, + "LowerShardAxis": 0.01289224624633789, + "LowerTensorOp": 0.012324810028076172, + "LowerToSendRecv": 0.01944112777709961, + "LowerTranspose": 0.024444580078125, + "MacroGeneration": 0.12030863761901855, + "MaskPropagation": 0.0041234493255615234, + "MemcpyElimination": 0.11655545234680176, + "MutateDataType": 0.006365299224853516, + "NeuronAliasDependencyInduction": 0.0008358955383300781, + "NeuronAliasDependencyReset": 0.0208890438079834, + "NeuronInstComb": 0.012987852096557617, + "NeuronLICM": 0.03186321258544922, + "NeuronLoopFusion": 0.039856910705566406, + "NeuronLoopInterchange": 0.0034656524658203125, + "NeuronSimplifier": 0.04315042495727539, + "NeuronSimplifyPredicates": 0.005248546600341797, + "NeuronValueNumbering": 0.017512798309326172, + "OptimizeAliasedCopyChain": 0.0023038387298583984, + "OptimizeNKIKernels": 0.3315870761871338, + "PAGLayoutOpt": 0.6959309577941895, + "PComputeCutting": 0.02900981903076172, + "PGLayoutTilingPipeline": 2.8589253425598145, + "PGTiling": 0.4929697513580322, + "PadElimination": 0.0008306503295898438, + "ParAxesAnnotation": 0.6449503898620605, + "PartialLoopFusion": 0.04073286056518555, + "PartialSimdFusion": 0.04506206512451172, + "PerfectLoopNest": 0.003442049026489258, + "RecognizeOpIdiom": 0.01386570930480957, + "Recompute": 0.0005090236663818359, + "RelaxPredicates": 0.007751941680908203, + "Rematerialization": 0.0035130977630615234, + "RemoveShardedPartitionAxes": 0.042932987213134766, + "ReshapeWeights": 0.005467653274536133, + "ResolveAccessConflict": 0.007354259490966797, + "ResolveComplicatePredicates": 0.0022590160369873047, + "RewriteReplicationMatmul": 0.0024857521057128906, + "RewriteWeights": 0.007905960083007813, + "SFKVectorizer": 0.45865941047668457, + "ShardingPropagationAnalysis": 0.015976905822753906, + "SimpleAllReduceTiling": 0.004487752914428711, + "Simplifier": 0.01264333724975586, + "SimplifyMacroPredicates": 0.010998964309692383, + "SimplifyNeuronTensor": 0.020704269409179688, + "SimplifySlice": 0.0029506683349609375, + "SimplifyTensor": 0.024234533309936523, + "SpillPSum": 0.03745222091674805, + "SplitAPUnionSets": 0.0402374267578125, + "SplitAccGrp": 0.0030994415283203125, + "StaticProfiler": 0.007781982421875, + "StaticTransposeLocalTensor": 0.015400409698486328, + "SundaISel": 0.15909790992736816, + "TCTransform": 0.0024313926696777344, + "TensorInitialization": 0.00689244270324707, + "TensorOpSimplifier": 0.009465932846069336, + "TensorOpTransform": 0.05043935775756836, + "TileCCOps": 0.01146245002746582, + "TilingProfiler": 0.030185699462890625, + "TransformConvOp": 0.003003835678100586, + "TritiumFusion": 0.07740235328674316, + "ValueNumbering": 0.006630659103393555, + "VectorizeDMA": 0.006995201110839844, + "VectorizeMatMult": 0.019536495208740234, + "WeightCoalescing": 0.007775783538818359, + "ZeroSizeTensorElimination": 0.0001773834228515625 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 9885.0, + "StaticProfiler::AifUb": 33.7130126953125, + "StaticProfiler::ArithmeticIntensityTensorizer": 285.20709228515625, + "StaticProfiler::AverageDmaLength": 1479.2880859375, + "StaticProfiler::AverageFractalPeUtilization": 99.77941131591797, + "StaticProfiler::AveragePartitionUtilization": 99.22618865966797, + "StaticProfiler::AveragePeUtilization": 99.2345962524414, + "StaticProfiler::DDRTransferBytes": 55208456.0, + "StaticProfiler::InternalTransferBytes": 47980544.0, + "StaticProfiler::LoadExpanded": 15885.0, + "StaticProfiler::LocalizationEfficiency": 845.9852294921875, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1554.64208984375, + "StaticProfiler::StoreExpanded": 10241.0, + "StaticProfiler::TotalDMAExpanded": 26126.0, + "StaticProfiler::TotalDynamicInstancesCount": 2424.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 2417.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 80.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 776.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 448.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 128.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 320.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 236.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.11948776245117188, + "AffinePredicateResolution": 0.0018799304962158203, + "AliasDependencyElimination": 0.00021576881408691406, + "AliasDependencyInduction": 0.007300615310668945, + "AliasDependencyReset": 0.025965213775634766, + "BFComputeCutting": 0.0029859542846679688, + "BirCodeGenLoop": 0.0455019474029541, + "CCOpFusion": 0.04734611511230469, + "CanonicalizeDAGForPGTiling": 0.022237777709960938, + "CanonicalizeIR": 0.002727985382080078, + "CoalesceCCOp": 0.02167034149169922, + "CommuteConcat": 0.003200054168701172, + "DMALocalityOpt": 0.00392460823059082, + "DMAProfiler": 0.009830236434936523, + "DMATilingProfiler": 0.025944948196411133, + "DataLocalityOpt": 0.3604612350463867, + "DataStreaming": 0.009065628051757813, + "DeConcat": 0.0069577693939208984, + "DeadCodeElimination": 0.011698722839355469, + "DeadStoreElimination": 0.06011176109313965, + "DelinearIndices": 0.020532608032226563, + "Delinearization": 0.00762939453125, + "DelinearizeSPMD": 0.03405618667602539, + "DoNothing": 8.106231689453125e-05, + "DramToDramTranspose": 0.01855611801147461, + "DumpGraphAndMetadata": 0.008964061737060547, + "EliminateDivs": 0.0031299591064453125, + "ExpandBatchNorm": 0.0030705928802490234, + "ExpandISAMacro": 0.006265163421630859, + "FactorizeBlkDims": 0.03638315200805664, + "FactorizeThreadAxesInFreeDims": 0.008359670639038086, + "FlattenMacroLoop": 0.012061595916748047, + "GenericAccessSimplifier": 0.0030562877655029297, + "InferInitValue": 0.08994674682617188, + "InferIntrinsicOnCC": 0.024573802947998047, + "InferNeuronTensor": 0.1031036376953125, + "InferNonlocalTensors": 0.05871725082397461, + "InferPSumTensor": 0.06618380546569824, + "InferShardAxis": 0.7525274753570557, + "InferSharedMemLoc": 0.0068051815032958984, + "InlineNativeKernels": 0.005843400955200195, + "InsertCoreBarrier": 0.008070230484008789, + "InsertIOTransposes": 0.04006528854370117, + "InsertImplicitShardAxisBeforeISel": 0.01073002815246582, + "InsertLocalTransposes": 0.014261007308959961, + "InsertOffloadedTransposes": 0.03949117660522461, + "LICM": 0.009208917617797852, + "LateLegalizeInst": 0.029766082763671875, + "LateLegalizePostSplit": 0.005662679672241211, + "LateLowerReshapeOp": 0.0074732303619384766, + "LateLowerTensorOp": 0.003675222396850586, + "LateNeuronInstComb": 0.010900020599365234, + "LayoutPreprocessing": 0.12459802627563477, + "LayoutPreprocessingAndAnalysis": 0.2370927333831787, + "LayoutRequirementAnalysis": 0.02673649787902832, + "LegalizeCCOpLayout": 0.001771688461303711, + "LegalizeOpLevelAlias": 0.001964569091796875, + "LegalizePartitionReduce": 0.0026857852935791016, + "LegalizeSundaAccess": 0.024449825286865234, + "LegalizeSundaMacro": 0.031160593032836914, + "LegalizeType": 0.01265263557434082, + "LocalLayoutOpt": 0.13158392906188965, + "LoopFusion": 0.008500337600708008, + "LoopSplitting": 0.007683753967285156, + "LowerBroadcast": 0.0029337406158447266, + "LowerCCOpBlockAxis": 0.019019126892089844, + "LowerComplexBroadcast": 0.0050733089447021484, + "LowerIntrinsics": 0.045258283615112305, + "LowerShardAxis": 0.010171175003051758, + "LowerTensorOp": 0.04014849662780762, + "LowerToSendRecv": 0.006317615509033203, + "LowerTranspose": 0.02257823944091797, + "MacroGeneration": 0.1289076805114746, + "MaskPropagation": 0.007184505462646484, + "MemcpyElimination": 0.13024330139160156, + "MutateDataType": 0.0023887157440185547, + "NeuronAliasDependencyInduction": 0.0008273124694824219, + "NeuronAliasDependencyReset": 0.023006439208984375, + "NeuronInstComb": 0.02357006072998047, + "NeuronLICM": 0.016632556915283203, + "NeuronLoopFusion": 0.05176591873168945, + "NeuronLoopInterchange": 0.003633737564086914, + "NeuronSimplifier": 0.055544376373291016, + "NeuronSimplifyPredicates": 0.0042285919189453125, + "NeuronValueNumbering": 0.007681369781494141, + "OptimizeAliasedCopyChain": 0.0018992424011230469, + "OptimizeNKIKernels": 0.42712831497192383, + "PAGLayoutOpt": 0.40447092056274414, + "PComputeCutting": 0.02052617073059082, + "PGLayoutTilingPipeline": 2.5240347385406494, + "PGTiling": 0.4373018741607666, + "PadElimination": 0.0004992485046386719, + "ParAxesAnnotation": 0.3364219665527344, + "PartialLoopFusion": 0.04578566551208496, + "PartialSimdFusion": 0.07974457740783691, + "PerfectLoopNest": 0.006705045700073242, + "RecognizeOpIdiom": 0.007408857345581055, + "Recompute": 0.0003921985626220703, + "RelaxPredicates": 0.004956483840942383, + "Rematerialization": 0.00407719612121582, + "RemoveShardedPartitionAxes": 0.03296494483947754, + "ReshapeWeights": 0.0016734600067138672, + "ResolveAccessConflict": 0.005868196487426758, + "ResolveComplicatePredicates": 0.0019488334655761719, + "RewriteReplicationMatmul": 0.002888917922973633, + "RewriteWeights": 0.0121307373046875, + "SFKVectorizer": 0.3227095603942871, + "ShardingPropagationAnalysis": 0.030770540237426758, + "SimpleAllReduceTiling": 0.005700588226318359, + "Simplifier": 0.006751298904418945, + "SimplifyMacroPredicates": 0.0224151611328125, + "SimplifyNeuronTensor": 0.026612043380737305, + "SimplifySlice": 0.0016014575958251953, + "SimplifyTensor": 0.014640331268310547, + "SpillPSum": 0.03543543815612793, + "SplitAPUnionSets": 0.04225468635559082, + "SplitAccGrp": 0.0025916099548339844, + "StaticProfiler": 0.004286527633666992, + "StaticTransposeLocalTensor": 0.01450037956237793, + "SundaISel": 0.09066033363342285, + "TCTransform": 0.001735687255859375, + "TensorInitialization": 0.005040168762207031, + "TensorOpSimplifier": 0.009763479232788086, + "TensorOpTransform": 0.037050485610961914, + "TileCCOps": 0.007235288619995117, + "TilingProfiler": 0.022336721420288086, + "TransformConvOp": 0.003210783004760742, + "TritiumFusion": 0.1834256649017334, + "ValueNumbering": 0.007995128631591797, + "VectorizeDMA": 0.009528160095214844, + "VectorizeMatMult": 0.04178977012634277, + "WeightCoalescing": 0.0037496089935302734, + "ZeroSizeTensorElimination": 0.00022602081298828125 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 12395.0, + "StaticProfiler::AifUb": 272.9356689453125, + "StaticProfiler::ArithmeticIntensityTensorizer": 394.9350280761719, + "StaticProfiler::AverageDmaLength": 1993.7806396484375, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.59767150878906, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 139593728.0, + "StaticProfiler::InternalTransferBytes": 38535168.0, + "StaticProfiler::LoadExpanded": 49793.0, + "StaticProfiler::LocalizationEfficiency": 144.69894409179688, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 179.81776428222656, + "StaticProfiler::StoreExpanded": 11265.0, + "StaticProfiler::TotalDMAExpanded": 61058.0, + "StaticProfiler::TotalDynamicInstancesCount": 4975.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4975.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 64.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 3072.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 496.0, + "TilingProfiler::PfTransposeInstructionsForIo": 144.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 96.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 256.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 275.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.057534217834472656, + "AffinePredicateResolution": 0.0009605884552001953, + "AliasDependencyElimination": 0.00025153160095214844, + "AliasDependencyInduction": 0.006276607513427734, + "AliasDependencyReset": 0.027743816375732422, + "BFComputeCutting": 0.0031321048736572266, + "BirCodeGenLoop": 0.5169932842254639, + "CCOpFusion": 0.05496716499328613, + "CanonicalizeDAGForPGTiling": 0.010706663131713867, + "CanonicalizeIR": 0.00154876708984375, + "CoalesceCCOp": 0.020469188690185547, + "CommuteConcat": 0.001708984375, + "DMALocalityOpt": 0.0024063587188720703, + "DMAProfiler": 0.021881103515625, + "DMATilingProfiler": 0.011522531509399414, + "DataLocalityOpt": 0.28015780448913574, + "DataStreaming": 0.018134355545043945, + "DeConcat": 0.002462148666381836, + "DeadCodeElimination": 0.0021996498107910156, + "DeadStoreElimination": 0.007483243942260742, + "DelinearIndices": 0.008810281753540039, + "Delinearization": 0.009731292724609375, + "DelinearizeSPMD": 0.04425859451293945, + "DoNothing": 6.67572021484375e-05, + "DramToDramTranspose": 0.012907743453979492, + "DumpGraphAndMetadata": 0.07597684860229492, + "EliminateDivs": 0.0021903514862060547, + "ExpandBatchNorm": 0.001527547836303711, + "ExpandISAMacro": 0.015442609786987305, + "FactorizeBlkDims": 0.020684003829956055, + "FactorizeThreadAxesInFreeDims": 0.003031015396118164, + "FlattenMacroLoop": 0.004990577697753906, + "GenericAccessSimplifier": 0.0007598400115966797, + "InferInitValue": 0.10130023956298828, + "InferIntrinsicOnCC": 0.007919549942016602, + "InferNeuronTensor": 0.05837249755859375, + "InferNonlocalTensors": 0.05706453323364258, + "InferPSumTensor": 0.04483771324157715, + "InferShardAxis": 0.4604020118713379, + "InferSharedMemLoc": 0.04048299789428711, + "InlineNativeKernels": 0.006569623947143555, + "InsertCoreBarrier": 0.010969161987304688, + "InsertIOTransposes": 0.0684211254119873, + "InsertImplicitShardAxisBeforeISel": 0.01549673080444336, + "InsertLocalTransposes": 0.022176742553710938, + "InsertOffloadedTransposes": 0.0181121826171875, + "LICM": 0.007555484771728516, + "LateLegalizeInst": 0.013030767440795898, + "LateLegalizePostSplit": 0.01993083953857422, + "LateLowerReshapeOp": 0.0016782283782958984, + "LateLowerTensorOp": 0.0021178722381591797, + "LateNeuronInstComb": 0.03255581855773926, + "LayoutPreprocessing": 0.10170960426330566, + "LayoutPreprocessingAndAnalysis": 0.23344039916992188, + "LayoutRequirementAnalysis": 0.032952308654785156, + "LegalizeCCOpLayout": 0.002583742141723633, + "LegalizeOpLevelAlias": 0.002170562744140625, + "LegalizePartitionReduce": 0.0025551319122314453, + "LegalizeSundaAccess": 0.08088016510009766, + "LegalizeSundaMacro": 0.04086017608642578, + "LegalizeType": 0.009904623031616211, + "LocalLayoutOpt": 0.023218154907226563, + "LoopFusion": 0.005990266799926758, + "LoopSplitting": 0.0007989406585693359, + "LowerBroadcast": 0.0051610469818115234, + "LowerCCOpBlockAxis": 0.007201671600341797, + "LowerComplexBroadcast": 0.00890207290649414, + "LowerIntrinsics": 0.09793353080749512, + "LowerShardAxis": 0.023633956909179688, + "LowerTensorOp": 0.03027796745300293, + "LowerToSendRecv": 0.027859210968017578, + "LowerTranspose": 0.0216217041015625, + "MacroGeneration": 0.12761783599853516, + "MaskPropagation": 0.01400303840637207, + "MemcpyElimination": 0.03596854209899902, + "MutateDataType": 0.0020971298217773438, + "NeuronAliasDependencyInduction": 0.0019202232360839844, + "NeuronAliasDependencyReset": 0.027405738830566406, + "NeuronInstComb": 0.024044275283813477, + "NeuronLICM": 0.027622222900390625, + "NeuronLoopFusion": 0.06255030632019043, + "NeuronLoopInterchange": 0.002681255340576172, + "NeuronSimplifier": 0.01907205581665039, + "NeuronSimplifyPredicates": 0.029021024703979492, + "NeuronValueNumbering": 0.011119604110717773, + "OptimizeAliasedCopyChain": 0.0005273818969726563, + "OptimizeNKIKernels": 4.391921043395996, + "PAGLayoutOpt": 0.16190624237060547, + "PComputeCutting": 0.016373872756958008, + "PGLayoutTilingPipeline": 2.0541465282440186, + "PGTiling": 0.3632845878601074, + "PadElimination": 0.0006501674652099609, + "ParAxesAnnotation": 0.08851456642150879, + "PartialLoopFusion": 0.05034661293029785, + "PartialSimdFusion": 0.014182329177856445, + "PerfectLoopNest": 0.0036270618438720703, + "RecognizeOpIdiom": 0.007064342498779297, + "Recompute": 0.00046062469482421875, + "RelaxPredicates": 0.02269601821899414, + "Rematerialization": 0.0019779205322265625, + "RemoveShardedPartitionAxes": 0.014830350875854492, + "ReshapeWeights": 0.0021474361419677734, + "ResolveAccessConflict": 0.007428646087646484, + "ResolveComplicatePredicates": 0.001834869384765625, + "RewriteReplicationMatmul": 0.006201982498168945, + "RewriteWeights": 0.004793643951416016, + "SFKVectorizer": 0.2884867191314697, + "ShardingPropagationAnalysis": 0.2801475524902344, + "SimpleAllReduceTiling": 0.008132696151733398, + "Simplifier": 0.003251314163208008, + "SimplifyMacroPredicates": 0.03280019760131836, + "SimplifyNeuronTensor": 0.04464459419250488, + "SimplifySlice": 0.0008628368377685547, + "SimplifyTensor": 0.014911413192749023, + "SpillPSum": 0.03145956993103027, + "SplitAPUnionSets": 0.09714126586914063, + "SplitAccGrp": 0.006166219711303711, + "StaticProfiler": 0.021403789520263672, + "StaticTransposeLocalTensor": 0.02319931983947754, + "SundaISel": 0.07143282890319824, + "TCTransform": 0.001344442367553711, + "TensorInitialization": 0.020877599716186523, + "TensorOpSimplifier": 0.0060787200927734375, + "TensorOpTransform": 0.03784608840942383, + "TileCCOps": 0.005100250244140625, + "TilingProfiler": 0.02941441535949707, + "TransformConvOp": 0.005896091461181641, + "TritiumFusion": 0.08978962898254395, + "ValueNumbering": 0.0032432079315185547, + "VectorizeDMA": 0.005987644195556641, + "VectorizeMatMult": 0.019278526306152344, + "WeightCoalescing": 0.004654884338378906, + "ZeroSizeTensorElimination": 0.00021028518676757813 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 22664.0, + "StaticProfiler::AifUb": 229.36119079589844, + "StaticProfiler::ArithmeticIntensityTensorizer": 194.92408752441406, + "StaticProfiler::AverageDmaLength": 2258.685546875, + "StaticProfiler::AverageFractalPeUtilization": 98.80319213867188, + "StaticProfiler::AveragePartitionUtilization": 94.51075744628906, + "StaticProfiler::AveragePeUtilization": 96.83863067626953, + "StaticProfiler::DDRTransferBytes": 420482080.0, + "StaticProfiler::InternalTransferBytes": 338614048.0, + "StaticProfiler::LoadExpanded": 118366.0, + "StaticProfiler::LocalizationEfficiency": 84.98564147949219, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 89.59233093261719, + "StaticProfiler::StoreExpanded": 4458.0, + "StaticProfiler::TotalDMAExpanded": 122824.0, + "StaticProfiler::TotalDynamicInstancesCount": 27423.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 26972.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 11808.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 9889.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 6.0, + "TilingProfiler::SimdInstructionsAfterTiling": 165.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 7.000000096013537e-06, + "CanonicalizeForTensorizer": 1.1000000085914508e-05, + "Canonicalizer": 0.00023700000019744039, + "HoistCompute": 4.999999873689376e-06, + "IdentifyCrossPassTensors": 1.2999999853491317e-05, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 1.2000000424450263e-05, + "PruneFunctions": 1.700000029813964e-05, + "RemoveOptimizationBarriers": 2.300000051036477e-05, + "ScatterMotion": 1.700000029813964e-05, + "TensorizerLegalizationPass": 1.5999999959603883e-05, + "VerifySupportedOps": 9.999999747378752e-06, + "algsimp": 4.70000013592653e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 4.999999873689376e-06, + "call-inliner": 7.999999979801942e-06, + "canonicalize-boundary-marker": 6.000000212225132e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 7.000000096013537e-06, + "computation-deduplicator": 1.8999999156221747e-05, + "config-lowering": 3.7000001611886546e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 9.999999747378752e-06, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2000000424450263e-05, + "flatten-call-graph": 7.000000096013537e-06, + "fuse-send-recv": 1.8999999156221747e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 3.999999989900971e-06, + "hilo::NeuronInstCombine": 5.0999999075429514e-05, + "hilo::NeuronOpFusion": 1.700000029813964e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.2999999853491317e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 1.2999999853491317e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 8.199999865610152e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.8000000636675395e-05, + "mlir::detail::OpToOpPassAdaptor": 2.5999999706982635e-05, + "mlir::hlo::MhloToPyPenguin": 0.0009560000034980476, + "mlir::mhlo::LowerComplexExtraPass": 7.000000186963007e-05, + "mlir::mhlo::LowerComplexPass": 0.00014000000373926014, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.8000000636675395e-05, + "neuron-hlo-verifier": 0.0003600000054575503, + "operand_upcaster": 1.4999999621068127e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0004780000017490238, + "replace-minimum-constant": 3.999999989900971e-06, + "reshape-mover": 1.9999999949504854e-06, + "simplify-concat": 3.9999998989515007e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 9.000000318337698e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 374.9828186035156, + "HloMacCount": 26843545600.0, + "Traffic": 143172128.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 1.1000000085914508e-05, + "CanonicalizeForTensorizer": 1.2000000424450263e-05, + "Canonicalizer": 0.0002899999963119626, + "HoistCompute": 3.999999989900971e-06, + "IdentifyCrossPassTensors": 2.300000051036477e-05, + "MemcastMotion": 1.1000000085914508e-05, + "PenguinizeFunctions": 1.2000000424450263e-05, + "PruneFunctions": 7.999999979801942e-06, + "RemoveOptimizationBarriers": 1.4000000192027073e-05, + "ScatterMotion": 3.000000106112566e-06, + "TensorizerLegalizationPass": 7.000000096013537e-06, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 5.6000000768108293e-05, + "batchnorm_expander": 9.999999747378752e-06, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 2.099999983329326e-05, + "config-lowering": 4.400000034365803e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 1.4000000192027073e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 1.2999999853491317e-05, + "flatten-call-graph": 1.1000000085914508e-05, + "fuse-send-recv": 1.8999999156221747e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 1.9999999949504854e-06, + "hilo::NeuronInstCombine": 6.900000153109431e-05, + "hilo::NeuronOpFusion": 6.000000212225132e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.5999999959603883e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 4.400000034365803e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.004767000209540129, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 2.9000000722589903e-05, + "mlir::hlo::MhloToPyPenguin": 0.006047999951988459, + "mlir::mhlo::LowerComplexExtraPass": 7.599999662488699e-05, + "mlir::mhlo::LowerComplexPass": 9.699999645818025e-05, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.2000000171829015e-05, + "neuron-hlo-verifier": 0.0003600000054575503, + "operand_upcaster": 1.700000029813964e-05, + "post-par-pipe-begin": 3.000000106112566e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005520000122487545, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.199999966658652e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.70000013592653e-05, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 107.69713592529297, + "HloMacCount": 19483983872.0, + "Traffic": 361829184.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.0069692134857177734, + "DMALocalityOpt": 0.006772279739379883, + "DMAProfiler": 0.008215665817260742, + "DataStreaming": 0.012622594833374023, + "DoNothing": 0.004723310470581055, + "ExpandISAMacro": 0.007757902145385742, + "FactorizeBlkDims": 0.030848026275634766, + "InferPSumTensor": 0.023444652557373047, + "InferSharedMemLoc": 0.010675668716430664, + "InsertCoreBarrier": 0.007489681243896484, + "LateLegalizeInst": 0.01503753662109375, + "LateNeuronInstComb": 0.017124652862548828, + "LegalizeSundaAccess": 0.028142213821411133, + "LegalizeType": 0.02222132682800293, + "LowerBroadcast": 0.006150484085083008, + "LowerIntrinsics": 0.00729680061340332, + "LowerTranspose": 0.006754398345947266, + "NeuronInstComb": 0.016539335250854492, + "NeuronLICM": 0.024366140365600586, + "NeuronSimplifyPredicates": 0.006876230239868164, + "NeuronValueNumbering": 0.007918596267700195, + "SFKVectorizer": 0.11957359313964844, + "SimpleAllReduceTiling": 0.016579627990722656, + "SimplifyNeuronTensor": 0.10249876976013184, + "SpillPSum": 0.03416609764099121, + "WeightCoalescing": 0.009296655654907227 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk3/graph.neff b/context_encoding_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..2892be0add6c6c5e79e96e6df95caf2d4133a007 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73 +size 1506304 diff --git a/context_encoding_model/_tp0_bk3/log-neuron-cc.txt b/context_encoding_model/_tp0_bk3/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e115389da188a16b3541779d6f7d3a997f3dbd9 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/log-neuron-cc.txt @@ -0,0 +1,9555 @@ +2025-11-04T21:38:32Z INFO 8576 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:32Z INFO 8576 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:32Z INFO 8594 [root]: XLA detected +2025-11-04T21:38:32Z INFO 8594 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:32Z INFO 8594 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3 +2025-11-04T21:38:32Z INFO 8594 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:32Z INFO 8594 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:32Z INFO 8594 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:32Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:32Z INFO 8594 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:32Z INFO 8594 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:32Z INFO 8594 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:33Z INFO 8594 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8312 + reshape 1912 23.00% ################################################################ + broadcast 1123 13.51% ##################################### + transpose 1072 12.90% ################################### + convert 945 11.37% ############################### + constant 636 7.65% ##################### + parameter 371 4.46% ############ + slice 347 4.17% ########### + add 284 3.42% ######### + get-tuple-element 259 3.12% ######## + multiply 255 3.07% ######## + dot 198 2.38% ###### + call 174 2.09% ##### + compare 173 2.08% ##### + select 170 2.05% ##### + concatenate 116 1.40% ### + tuple 57 0.69% # + scatter 57 0.69% # + negate 56 0.67% # + all-reduce 56 0.67% # + divide 29 0.35% + gather 6 0.07% + iota 5 0.06% + all-gather 3 0.04% + reduce 3 0.04% + custom-call 2 0.02% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5437 + reshape 1421 26.14% ################################################################ + transpose 817 15.03% #################################### + convert 720 13.24% ################################ + constant 443 8.15% ################### + parameter 371 6.82% ################ + broadcast 266 4.89% ########### + dot 197 3.62% ######## + custom-call 175 3.22% ####### + multiply 171 3.15% ####### + add 171 3.15% ####### + get-tuple-element 147 2.70% ###### + slice 115 2.12% ##### + concatenate 114 2.10% ##### + compare 59 1.09% ## + select 58 1.07% ## + scatter 57 1.05% ## + negate 56 1.03% ## + all-reduce 56 1.03% ## + gather 6 0.11% + all-gather 3 0.06% + iota 3 0.06% + reduce 3 0.06% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 724775731200 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:33Z INFO 8594 [job.HLOToTensorizer.0]: IR signature: 86247b71fdb68182914f06dcd53871dafb9196589a2268ca003589535514de57 for sg0000/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8594 [job.HLOToTensorizer.0]: IR signature: d06bf201ec237c2793e6f9f6befbb43fe986d2b05cbf3fd38077014348d4b362 for sg0001/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8594 [job.HLOToTensorizer.0]: IR signature: 44e4d964e525fd3b8d5dfaf970931f8b9d7fa6a97ad605e177799846a0eca67f for sg0002/HLOToTensorizer +2025-11-04T21:38:33Z INFO 8594 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:33Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:33Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:33Z INFO 8594 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:33Z INFO 8594 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:33Z INFO 8594 [job.Frontend.0]: Start model loading +2025-11-04T21:38:33Z INFO 8594 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:33Z INFO 8594 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:33Z USER 8594 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:33Z INFO 8594 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:33Z INFO 8680 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:33Z INFO 8682 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:33Z INFO 8681 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:33Z INFO 8680 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8680 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8681 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8681 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:33Z INFO 8680 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8681 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.012 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.022 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.044 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.040 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.009 seconds +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:33Z INFO 8680 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.017 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.047 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8682 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.010 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:33Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.006 seconds +2025-11-04T21:38:33Z INFO 8682 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.030 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.056 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.011 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.053 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.037 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.026 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.123 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.130 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.032 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.007 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.038 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.028 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.036 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.025 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.035 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.023 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.011 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.016 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:34Z INFO 8680 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.052 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.008 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.031 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:34Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:34Z INFO 8682 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.046 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.050 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.011 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.088 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.112 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.060 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.117 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.024 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8681 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6459 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6596 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:35Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.023 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.028 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.013 seconds +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:35Z INFO 8682 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.102 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.154 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.012 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.033 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.233 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.057 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.012 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.089 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.162 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.044 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.132 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.022 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.125 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.280 seconds +2025-11-04T21:38:36Z INFO 8682 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.027 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.237 seconds +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.071 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8681 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8680 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.059 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=4194304 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 1024) %'all_gather.1' = AllGatherOp-34 AllGather_add(bfloat16 (1024, 1024) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 1024), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.47 | hlo_id: 15 | , id = 34 +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.025 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 36 +total number of sharded dags: 13 + +total bytes transferred from input, output, non local tensors: 370225954 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 345041680 +% bytes transferred with 2x bandwidths: 93.20 + +NC0 FLOPs: 55340232214854943330 +NC1 FLOPs: 55340232214854936160 +% FLOPs sharded: 100.00 + + +Shard dim: 1024, Number of dags: 7 +Matmuls sharded with this dim: +[1024(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [1024(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,2,6,2,128] = [1024(s),2,6,2,128] Number of occurrences: 2 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.460 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.017 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.336 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.030 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.404 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.034 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.031 seconds +2025-11-04T21:38:37Z INFO 8681 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG85'), (25, 'AG82'), (19, 'AG84'), (24, 'AG83')] +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG94'), (23, 'AG93'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.058 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.023 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.113 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.019 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.172 seconds +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.128 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.363 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:37Z INFO 8680 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.068 seconds +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:37Z INFO 8682 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.054 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 48: simd128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: reduce512x1x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: reduce512x1x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.029 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.311 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 32 +total number of sharded dags: 25 + +total bytes transferred from input, output, non local tensors: 84943876 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 51384320 +% bytes transferred with 2x bandwidths: 60.49 + +NC0 FLOPs: 92233720359980498947 +NC1 FLOPs: 92233720359980498944 +% FLOPs sharded: 100.00 + + +Shard dim: 1024, Number of dags: 24 +Matmuls sharded with this dim: +[1024(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [1024(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [1024(s),2,2,2,2,64] Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,2,6,2,128] = [1024(s),2,6,2,128] Number of occurrences: 2 +[1024(s),2,8,128] @ [2,8,128,4,128] = [1024(s),4,128] Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,4,2,64] = [1024(s),4,2,64] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[1024,4,2,128] @ [4,2,128,2(s),2,4,128] = [1024,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.055 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.058 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.033 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.753 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.025 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.022 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG119'), (30, 'AG116'), (25, 'AG118'), (28, 'AG117')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(13, 'AG123'), (9, 'AG124')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(18, 'AG128'), (14, 'AG129')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(20, 'AG135'), (12, 'AG137'), (17, 'AG136')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 694 of IO tensor non_local bfloat16 %reshape.68(4, 2, 2, 64, 2, 512) is not sorted, index list (w/ AG ids): [(10, 'AG130'), (15, 'AG131'), (7, 'AG115'), (26, 'AG114')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 644 of IO tensor non_local bfloat16 %reshape.73(4, 2, 2, 512, 128) is not sorted, index list (w/ AG ids): [(11, 'AG133'), (16, 'AG134'), (7, 'AG115'), (19, 'AG132')] +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.119 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.280 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 16: dma128x1024 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: reduce512x1x1 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: dma1x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 2: simd1x512 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.029 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.645 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.041 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.696 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.129 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.031 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.016 seconds +2025-11-04T21:38:38Z INFO 8680 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.437 seconds +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8681 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:38Z INFO 8682 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.040 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.033 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.039 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.019 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.524 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 48: simd128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingBottleneck]: 16: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.101 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.022 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.019 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 32 +total number of sharded dags: 25 + +total bytes transferred from input, output, non local tensors: 40385542 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 31995904 +% bytes transferred with 2x bandwidths: 79.23 + +NC0 FLOPs: 36893488145284694019 +NC1 FLOPs: 36893488145284694016 +% FLOPs sharded: 100.00 + + +Shard dim: 1024, Number of dags: 24 +Matmuls sharded with this dim: +[1024(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [1024(s),2,2,2,2,64] Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,4,128] = [1024(s),4,128] Number of occurrences: 1 +[1024(s),2,8,128] @ [2,8,128,4,2,64] = [1024(s),4,2,64] Number of occurrences: 1 +[64] @ [1024(s)] = [64,1024(s)] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[1024,4,2,128] @ [4,2,128,2(s),2,4,128] = [1024,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.099 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.103 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.071 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.027 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.043 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.063 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.638 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.021 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8681 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.057 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(28, 'AG88'), (23, 'AG90'), (26, 'AG89')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG90'), (28, 'AG88'), (26, 'AG89')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG93'), (21, 'AG96')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(23, 'AG90'), (28, 'AG88'), (26, 'AG89')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(24, 'AG93'), (17, 'AG100')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(23, 'AG90'), (28, 'AG88'), (26, 'AG89')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG106'), (22, 'AG108'), (25, 'AG107')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 631 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 2, 512) is not sorted, index list (w/ AG ids): [(23, 'AG90'), (26, 'AG89'), (28, 'AG88'), (1, 'AG92')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 582 of IO tensor non_local bfloat16 %reshape.16(2, 2, 2, 2, 64, 2, 512) is not sorted, index list (w/ AG ids): [(7, 'AG99'), (12, 'AG98'), (16, 'AG97'), (21, 'AG96'), (24, 'AG93'), (1, 'AG92')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 676 of IO tensor non_local bfloat16 %reshape.24(4, 2, 2, 64, 2, 512) is not sorted, index list (w/ AG ids): [(8, 'AG101'), (13, 'AG102'), (17, 'AG100'), (24, 'AG93'), (1, 'AG92')] +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 614 of IO tensor non_local bfloat16 %reshape.29(4, 2, 2, 512, 128) is not sorted, index list (w/ AG ids): [(9, 'AG104'), (14, 'AG105'), (1, 'AG92'), (18, 'AG103')] +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.102 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8680 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8682 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.029 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:40Z WARNING 8681 [sg0001/Tensorizer/DataLocalityOpt]: Generated 128x1 DMA for macro: + dma128x1:free_axes={};partition_axes={i2_1_1_1516=[0:128:1]};#instances=8192 { + for (i2_1_1_1516: range(0, 128, 1)) { # indent=16 + bfloat16 $1515[i2_0_1516, i0_0_1516, i2_1_0_1516, i0_1_1516, i1_1516, i3_1516, i2_1_1_1516] = load TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 128) %1517[i0_0_1516, i2_0_1516, i2_1_0_1516, i2_1_1_1516, i0_1_1516, i1_1516, i3_1516] # dl = tensor_op_name: _reshape.335 | hlo_id: 160 | + non_local bfloat16 (2, 2, 2, 2, 4, 128, 128) %'reshape.73'[i0_0_1516, i0_1_1516, i1_1516, i2_0_1516, i2_1_0_1516, i2_1_1_1516, i3_1516] = store bfloat16 $1515[i2_0_1516, i0_0_1516, i2_1_0_1516, i0_1_1516, i1_1516, i3_1516, i2_1_1_1516] # dl = tensor_op_name: _reshape.335 | hlo_id: 160 | , id = 644 + } + } +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.090 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.360 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 8192: dma128x1 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 768: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 48: simd128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.026 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.037 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.038 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.120 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.493 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.019 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.039 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.050 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.031 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.050 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.016 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.859 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 16: indirect_load128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 16: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x256 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingBottleneck]: 16: simd128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.030 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.028 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.064 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.066 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.090 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.048 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.033 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.056 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.031 seconds +2025-11-04T21:38:40Z INFO 8682 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:40Z WARNING 8680 [sg0000/Tensorizer/DataLocalityOpt]: Generated 128x1 DMA for macro: + dma128x1:free_axes={};partition_axes={i2_1_1_1626=[0:128:1]};#instances=8192 { + for (i2_1_1_1626: range(0, 128, 1)) { # indent=16 + bfloat16 $1625[i2_0_1626, i0_0_1626, i2_1_0_1626, i0_1_1626, i1_1626, i3_1626, i2_1_1_1626] = load TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 128) %1627[i0_0_1626, i2_0_1626, i2_1_0_1626, i2_1_1_1626, i0_1_1626, i1_1626, i3_1626] # dl = tensor_op_name: _reshape.90 | hlo_id: 134 | + non_local bfloat16 (2, 2, 2, 2, 4, 128, 128) %'reshape.29'[i0_0_1626, i0_1_1626, i1_1626, i2_0_1626, i2_1_0_1626, i2_1_1_1626, i3_1626] = store bfloat16 $1625[i2_0_1626, i0_0_1626, i2_1_0_1626, i0_1_1626, i1_1626, i3_1626, i2_1_1_1626] # dl = tensor_op_name: _reshape.90 | hlo_id: 134 | , id = 614 + } + } +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.228 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 8192: dma128x1 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: matmul_128x128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: transpose_128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: generic_store128x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: indirect_load128x512 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 16: rmsnorm128x512x128 +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.017 seconds +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.091 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:40Z INFO 8681 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.043 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.044 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.098 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.042 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.052 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.040 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.041 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.028 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.018 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.017 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.045 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.114 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.073 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.139 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.081 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.043 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.021 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LICM]: LICM finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.026 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.045 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.007 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.018 seconds +2025-11-04T21:38:41Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.159 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.021 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.079 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.080 seconds +2025-11-04T21:38:41Z INFO 8681 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.040 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.088 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.044 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.288 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.183 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.023 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.036 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.020 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.042 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.092 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 64.924% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'992.1586'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1585, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_992 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 198.539us (24.000MiB, est bw: 126.755GB/s, 8.463% of tot. time) for bfloat16<128 x 512> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 6, 128, 2, 512) %'input365_local_1070'[i16_0_1076,i15_0_0_0_1,i15_0_0_0_0,c1_1062,c2_1063,i0.128,i3.2,i1.128+128i2.2+256p_1696] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input365'[i15_0_0_0_1+2i15_0_0_0_0,p_1696,c1_1062,i0.128,c2_1063,i3.2,i2.2,i1.128] # id=1376, src_id=None, , instances=192 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 8.259% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.55'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'dot.200.1596'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1594, src_id=None, , instances=600 # dl = tensor_op_name: _dot.200 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 123.036us (24.000MiB, est bw: 204.541GB/s, 5.245% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 6, 2, 2, 128, 2048) %'input366_local_1047'[i11_0,2i10_0_0_1_0+i10_0_0_1_1,i10_0_0_0,c2_1041,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input366'[i10_0_0_0,2i10_0_0_1_0+i10_0_0_1_1,i0.128,c2_1041,i1.2048] # id=1367, src_id=None, , instances=48 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 123.036us (24.000MiB, est bw: 204.541GB/s, 5.245% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 6, 2, 2, 128, 2048) %'input368_local_1058'[i16_0_1076,2i12_0_0_1_0+i12_0_0_1_1,i12_0_0_0,c2_1052,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input368'[i12_0_0_0,2i12_0_0_1_0+i12_0_0_1_1,i0.128,c2_1052,i1.2048] # id=1370, src_id=None, , instances=48 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 0.920% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'996.1670'[i11_0,T_i1_0,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 512, 2048) %'add.9'[i11_0,i0.128+128T_i1_0,i1.2048] # id=1560, src_id=None, , instances=8 # dl = tensor_op_name: add.9_pftranspose_996 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 0.920% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2, 2, 512) %'_reload_1523'[i16_0_1076,i4_0_1_1526_0,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM3DBlk partitions[2] bfloat16 (4, 2, 128, 2048) %'_spill_1520'[i4_0_1_1526_0,i16_0_1076,i0.128,i1.512+1024i2.2+512i3.2] # id=1525, src_id=None, , instances=8 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 0.920% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'1000.1675'[T_i20_0_1008,T_i1_0,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2097152,) %'all_reduce.3-buffer-2033'[1048576T_i20_0_1008+2048i0.128+262144T_i1_0+i1.2048] # id=1569, src_id=None, , instances=8 # dl = tensor_op_name: all_reduce.3_pftranspose_1000 | hlo_id: 66 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 13.416us (4.000MiB, est bw: 312.630GB/s, 0.572% of tot. time) for bfloat16<128 x 2048> DRAM3DBlk partitions[2] bfloat16 (4, 2, 128, 2048) %'_spill_1520'[i2_0_1_1634_2011_0,i11_0,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %1014[i11_0,i2_0_1_1634_2011_0,i0.128,i1.2048] # id=1522, src_id=None, , instances=8 # dl = tensor_op_name: _custom-call.348 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 13.416us (4.000MiB, est bw: 312.630GB/s, 0.572% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (2097152,) %'dot.14-buffer-2031'[1048576i16_0_1076+2048i0.128+262144i16_1_0_1076_1527+i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %1077[i16_0_1076,i16_1_0_1076_1527,i0.128,i1.2048] # id=1379, src_id=None, , instances=8 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.108 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.022 seconds +2025-11-04T21:38:42Z INFO 8682 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.046 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.023 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.023 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.022 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.035 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.037 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.016 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.043 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.045 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.035 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.045 seconds +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8680 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.017 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:42Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.077 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.033 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.030 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.066 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.024 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.040 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.041 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.083 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.050 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.024 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.037 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.060 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.017 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.037 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.014 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.068 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.017 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.098 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.021 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.032 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.323 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.030 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.069 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.022 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.040 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.110 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.031 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 198.539us (24.000MiB, est bw: 126.755GB/s, 24.815% of tot. time) for bfloat16<128 x 512> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 6, 128, 2, 512) %'input68_local_1426'[i16_0_1432,i15_0_0_0_1,i15_0_0_0_0,c1_1418,c2_1419,i0.128,i3.2,i1.128+128i2.2+256p_1943] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input68'[i15_0_0_0_1+2i15_0_0_0_0,p_1943,c1_1418,i0.128,c2_1419,i3.2,i2.2,i1.128] # id=1665, src_id=None, , instances=192 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 123.036us (24.000MiB, est bw: 204.541GB/s, 15.378% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 6, 2, 2, 128, 2048) %'input69_local_1403'[i11_0,2i10_0_0_1_0+i10_0_0_1_1,i10_0_0_0,c2_1397,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input69'[i10_0_0_0,2i10_0_0_1_0+i10_0_0_1_1,i0.128,c2_1397,i1.2048] # id=1656, src_id=None, , instances=48 # dl = tensor_op_name: _dot.4 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 123.036us (24.000MiB, est bw: 204.541GB/s, 15.378% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 6, 2, 2, 128, 2048) %'input71_local_1414'[i16_0_1432,2i12_0_0_1_0+i12_0_0_1_1,i12_0_0_0,c2_1408,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input71'[i12_0_0_0,2i12_0_0_1_0+i12_0_0_1_1,i0.128,c2_1408,i1.2048] # id=1659, src_id=None, , instances=48 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 5.234% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'input78_local_1450'[i37_0,i38_0_0,c1_1442,c2_1443,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input78'[i38_0_0,c1_1442,i0.128,i1.2048+2048c2_1443] # id=1679, src_id=None, , instances=16 # dl = tensor_op_name: _dot.9 | hlo_id: 71 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'1350.1909'[i11_0,T_i1_0,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 512, 2048) %'add.4'[i11_0,i0.128+128T_i1_0,i1.2048] # id=1780, src_id=None, , instances=8 # dl = tensor_op_name: add.4_pftranspose_1350 | hlo_id: 15 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'_reload_1775'[i16_0_1432,i4_0_0_711_1778,i4_0_1_1778_0,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'_spill_1772'[i4_0_0_711_1778,i4_0_1_1778_0,i16_0_1432,i0.128,i1.2048] # id=1777, src_id=None, , instances=8 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 4, 128, 2048) %'1354.1914'[i37_0,T_i1_0,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2097152,) %'all_reduce.1-buffer-2416'[1048576i37_0+2048i0.128+262144T_i1_0+i1.2048] # id=1789, src_id=None, , instances=8 # dl = tensor_op_name: all_reduce.1_pftranspose_1354 | hlo_id: 54 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'input76_local_1471'[i67_0,c0_1464,c1_1465,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input76'[c0_1464,i0.128,i1.2048+2048c1_1465] # id=1702, src_id=None, , instances=8 # dl = tensor_op_name: _dot.8 | hlo_id: 114 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'input73_local_1510'[i2_0_1516,c0_1503,c1_1504,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input73'[c0_1503,i0.128,i1.2048+2048c1_1504] # id=1725, src_id=None, , instances=8 # dl = tensor_op_name: _dot.7 | hlo_id: 155 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 2.698% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2, 8, 128) %'get_tuple_element.2_local_1524'[i98_0_0_0_1541,c0_1518_0,c0_1518_1,i0.128,i3.2,i2.8,i1.128] = load bfloat16<128 x 2048> non_local bfloat16 (4, 2, 128, 8, 128) %'get_tuple_element.2'[2c0_1518_0+c0_1518_1,i3.2,i0.128,i2.8,i1.128] # id=1731, src_id=None, , instances=8 # dl = tensor_op_name: _dot.10 | hlo_id: 173 | [[i0.128];[i1.128, i2.8, i3.2]] -> [[i0.128];[i1.128, i2.8, i3.2]] +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.045 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.076 seconds +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:43Z INFO 8680 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8680 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8680 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:44Z INFO 8680 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.017 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.010 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.017 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.034 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 35676) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.427 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.058 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.058 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.042 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.024 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.023 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.010 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.028 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.047 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.102 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.124 seconds +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: BirCodeGen estimate #instances=2582 in sg0001 +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: IR signature: 40b8410f3e3a61bf7fd45b7e0e94e207d7397ffe04416c5629eaa4110af8acc8 for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:45Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.120 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:45Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.015 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.021 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8681 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.046 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.010 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: BirCodeGen estimate #instances=2582 in sg0001 +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: IR signature: 6fff65cb59e68a9aca9ef846bc6e9ebfb21128d363ecc5824bb9a3f8b2f8bc22 for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: Weights total number of bytes: 196610 +2025-11-04T21:38:45Z INFO 8681 [Tensorizer]: Successfully built model. +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.206 seconds +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.459 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.021 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.025 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 12.020% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'input67_local_1588'[i34_0,i35_0_0,c1_1580,c2_1581,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input67'[i35_0_0,c1_1580,i0.128,i1.2048+2048c2_1581] # id=1791, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2 | hlo_id: 32 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 34.173us (4.000MiB, est bw: 122.737GB/s, 9.809% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 512) %'1499.2155'[T_i4,T_i0,T_i1,2T_i2_0+T_i2_1,i0.128,i1.512] = load bfloat16<128 x 512> non_local bfloat16 (2, 2, 4, 128, 2, 512) %'all_gather.1'[T_i0,T_i1,2T_i2_0+T_i2_1,i0.128,T_i4,i1.512] # id=2057, src_id=None, , instances=32 # dl = tensor_op_name: all_gather.1_pftranspose_1499 | hlo_id: 15 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 34.173us (4.000MiB, est bw: 122.737GB/s, 9.809% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 512) %'custom-call.177.2131'[i34_0,i16_0_0_1569,i16_0_1_0_1569,i16_0_1_1_1569,i0.128,i1.512] = load bfloat16<128 x 512> non_local bfloat16 (2, 2, 4, 128, 2, 512) %'all_gather.1'[i16_0_0_1569,i16_0_1_0_1569,i16_0_1_1_1569,i0.128,i34_0,i1.512] # id=1786, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.177 | hlo_id: 24 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 6.197% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'input65_local_1604'[i64_0,c0_1597,c1_1598,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input65'[c0_1597,i0.128,i1.2048+2048c1_1598] # id=1838, src_id=None, , instances=8 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 6.197% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'input62_local_1620'[i2_0_1626,c0_1613,c1_1614,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 128, 4096) %'input62'[c0_1613,i0.128,i1.2048+2048c1_1614] # id=1885, src_id=None, , instances=8 # dl = tensor_op_name: _dot | hlo_id: 129 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 6.197% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2, 8, 128) %'get_tuple_element.1_local_1634'[i95_0_0_0_1651,c0_1628_0,c0_1628_1,i0.128,i3.2,i2.8,i1.128] = load bfloat16<128 x 2048> non_local bfloat16 (4, 2, 128, 8, 128) %'get_tuple_element.1'[2c0_1628_0+c0_1628_1,i3.2,i0.128,i2.8,i1.128] # id=1891, src_id=None, , instances=8 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.128, i2.8, i3.2]] -> [[i0.128];[i1.128, i2.8, i3.2]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 21.589us (4.000MiB, est bw: 194.277GB/s, 6.197% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2048) %'input61_local_1645'[i95_0_0_0_1651,i95_0_0_1,c2_1639_0_2709,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input61'[i95_0_0_0_1651,i95_0_0_1,i0.128,i1.2048+2048c2_1639_0_2709] # id=1892, src_id=None, , instances=8 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 17.737us (2.000MiB, est bw: 118.239GB/s, 5.091% of tot. time) for bfloat16<128 x 512> TongaSB partitions[2] bfloat16 (2, 8, 128, 512) %'transpose.1_pftranspose_1494'[T_i2_0_1498,c0_1533_1930,i0.128,i1.512] = indirect_load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (151936, 2, 512) %'input60'[i0.128,T_i2_0_1498,i1.512] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[1] int32 (2, 128, 8, 1) %'gather.41.1928'[T_i2_0_1498,i0.128,c0_1533_1930,0] # id=1746, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=16 # dl = tensor_op_name: _gather.41 | hlo_id: 12 | [[i0.128];[i1.512]] -> [[i0.128];[i1.512]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 15.912us (4.000MiB, est bw: 263.593GB/s, 4.567% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (2097152,) %'dot.4-buffer-2752'[1024i95_0_0_0_1651+2048i0.128+262144i96_0_1651+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (2, 8, 128, 1024) %1652[i95_0_0_0_1651,i96_0_1651,i0.128,i1.1024] # id=1895, src_id=None, , instances=16 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 13.768us (1.000MiB, est bw: 76.160GB/s, 3.952% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output2'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[4] int32 (2, 2, 2, 4, 128, 1) %'scatter.6719.2318'[i111_0,i105_0,i105_1,i104_1_0,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[2] bfloat16 (2, 2, 128, 4, 2, 128) %'transpose.19'[i111_0,i105_0,i0.128,i104_1_0,i105_1,i1.128] # id=1909, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=32 # dl = tensor_op_name: _scatter.6719 | hlo_id: 187 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.015 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 35676) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.332 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.051 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.051 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.392 seconds +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.040 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.062 seconds +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.063 seconds +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:46Z WARNING 8682 [sg0002/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 51.13 percent of all matmul computation +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.021 seconds +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.038 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:46Z INFO 8682 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8680 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.097 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.019 seconds +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.020 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.040 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.024 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.054 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.055 seconds +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.174 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.076 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: BirCodeGen estimate #instances=1305 in sg0000 +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: IR signature: 98daca180c0ec3f47dd29ac8a5821c14c620605cb9f684d9efa077642378433a for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.028 seconds +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8680 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.072 seconds +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: BirCodeGen estimate #instances=1305 in sg0000 +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: IR signature: 2f3cd749ef44ac56048698881c3043500020f4b3a8872c0592b618a21b3a290d for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: Weights total number of bytes: 229634 +2025-11-04T21:38:47Z INFO 8680 [Tensorizer]: Successfully built model. +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.310 seconds +2025-11-04T21:38:47Z INFO 8682 [Tensorizer]: BirCodeGen estimate #instances=26049 in sg0002 +2025-11-04T21:38:47Z INFO 8682 [Tensorizer]: IR signature: d02a6a5b8b788c8805aa29f851a6db72e11bae39ebe34ad3abe1a75d0126c11d for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:47Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:48Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8682 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.517 seconds +2025-11-04T21:38:48Z INFO 8682 [Tensorizer]: BirCodeGen estimate #instances=26049 in sg0002 +2025-11-04T21:38:48Z INFO 8682 [Tensorizer]: IR signature: cabc1abb7e92515618d6f8337386e63f4deff5b3549d4d74e54363188aae28da for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:48Z INFO 8682 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:48Z INFO 8682 [Tensorizer]: Successfully built model. +2025-11-04T21:38:48Z USER 8594 [root/Tensorizer/Tensorizer]: Tensorizer finished after 15.210 seconds +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: End tensorization +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:48Z INFO 8594 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:48Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:48Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:48Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:48Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:48Z INFO 8594 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:48Z INFO 8594 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,vector_dynamic_offsets,spill_reload,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:48Z INFO 8594 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:48Z INFO 9044 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:48Z INFO 9044 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:49Z INFO 9044 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg" +2025-11-04T21:38:49Z INFO 9044 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:49Z INFO 9044 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:49Z INFO 9044 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:49Z INFO 9044 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1914 blocks=6 instructions=1776 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.232.2350}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 92mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.232.2350}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.022 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.026 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.038 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 113mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.049 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 123mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.159 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 183mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.188 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.197 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1914 blocks=6 instructions=1776 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=312 blocks=2 instructions=168 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=420 blocks=2 instructions=202 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1182 blocks=2 instructions=1406 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 420 memory location(s), 2 block(s), and 202 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 312 memory location(s), 2 block(s), and 168 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1182 memory location(s), 2 block(s), and 1406 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1914 blocks=6 instructions=1776 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 210 memory location(s), 1 block(s), and 101 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 156 memory location(s), 1 block(s), and 84 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=210 blocks=1 instructions=101 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=156 blocks=1 instructions=84 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 187mb, ru_maxrss: 213mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 591 memory location(s), 1 block(s), and 703 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=591 blocks=1 instructions=703 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:49 2025 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Total count: 1303 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Matmult: 641 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: TensorScalarPtr: 171 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: TensorTensor: 134 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: GenericCopy: 128 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Load: 69 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Activation: 55 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Save: 40 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: DMACopy: 40 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 40 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.067 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 248mb, ru_maxrss: 248mb (delta=35mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1411 memory location(s), 1 block(s), and 1303 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=1411 blocks=1 instructions=1303 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Total count: 1305 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Matmult: 641 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: TensorScalarPtr: 171 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: TensorTensor: 134 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: GenericCopy: 128 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Load: 69 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Activation: 55 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Save: 41 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: DMACopy: 41 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Memset: 9 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 40 +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.004 seconds +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.084 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 252mb (delta=3mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 252mb, ru_maxrss: 252mb (delta=39mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1411 memory location(s), 1 block(s), and 1305 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=1411 blocks=1 instructions=1305 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.010 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 256mb, ru_maxrss: 256mb (delta=3mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Total count: 2582 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Matmult: 1828 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Load: 198 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: TensorScalarPtr: 128 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: GenericCopy: 121 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: TensorTensor: 120 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Activation: 82 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Save: 45 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: DMACopy: 34 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 32 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.145 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 270mb, ru_maxrss: 270mb (delta=57mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Total count: 2580 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Matmult: 1828 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Load: 198 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: TensorScalarPtr: 128 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: GenericCopy: 121 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: TensorTensor: 120 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Activation: 82 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Save: 44 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: DMACopy: 33 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Memset: 10 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1527 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=1527 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 32 +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.155 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 270mb (delta=57mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1527 memory location(s), 1 block(s), and 2580 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=1527 blocks=1 instructions=2580 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.006 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 270mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.016 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 261mb, ru_maxrss: 270mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Total count: 14230 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Matmult: 11306 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: GenericCopy: 1452 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Load: 490 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Save: 330 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: TensorTensor: 83 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Activation: 61 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Memset: 23 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: TensorReduce: 10 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.467 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 373mb (delta=160mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5769 memory location(s), 1 block(s), and 14230 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5769 blocks=1 instructions=14230 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.041 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 341mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:49 2025 + +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Total count: 14219 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Matmult: 11306 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: GenericCopy: 1452 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Load: 490 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Save: 319 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: TensorTensor: 83 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Activation: 61 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: TensorScalarPtr: 53 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Memset: 23 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: TensorReduce: 10 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Select: 3 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Iota: 2 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.532 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 341mb, ru_maxrss: 373mb (delta=160mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5769 memory location(s), 1 block(s), and 14219 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5769 blocks=1 instructions=14219 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.049 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.593 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=160mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=8521 blocks=6 instructions=35425 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1307 blocks=2 instructions=2605 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=1485 blocks=2 instructions=5161 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5729 blocks=2 instructions=27659 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1307 memory location(s), 2 block(s), and 2605 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1485 memory location(s), 2 block(s), and 5161 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5729 memory location(s), 2 block(s), and 27659 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.009 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=8521 blocks=6 instructions=35425 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.004 seconds +2025-11-04T21:38:49Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1267_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:49Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1272_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:49Z USER 9044 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 298mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 299mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.010 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 299mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.015 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 299mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.069 seconds +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 305mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.076 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.077 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=8521 blocks=6 instructions=35425 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1485 blocks=2 instructions=5161 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1307 blocks=2 instructions=2605 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=5729 blocks=2 instructions=27659 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1307 memory location(s), 2 block(s), and 2605 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1485 memory location(s), 2 block(s), and 5161 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:49Z USER 9044 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.004 seconds +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5729 memory location(s), 2 block(s), and 27659 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.006 seconds +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:49Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=8521 blocks=6 instructions=35425 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z WARNING 9044 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:49Z INFO 9044 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:49Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:49Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z WARNING 9044 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 480 bytes/partition +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0 seconds +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.001 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 480 bytes/partition +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.01 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z WARNING 9044 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 4 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.004 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 306mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.002 seconds +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z WARNING 9044 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 4 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.001 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Allocs: 654 instructions: 1304 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 3264 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Done build fdeps 3264 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Allocs: 653 instructions: 1301 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=654 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=655 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: size = 150 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: found 313 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: mean: 4.17333 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: median: 4.15772 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 2504 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: lo = 150 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: total = 150 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Allocs: 742 instructions: 2579 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 3262 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Done build fdeps 3262 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.038 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.032 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.043 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.031 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 15 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z WARNING 9044 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 4 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 51 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.007 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 16841476 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2085 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8650754 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 653 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=653 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 654 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=654 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z WARNING 9044 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 308mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 11 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: size = 470 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: found 75 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: largest = custom-call.177.2122_i0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: size = 150 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: found 313 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: mean: 4.17333 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: median: 4.15772 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 2504 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: lo = 150 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: total = 150 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: 60 remat count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Num intervals 470 Num locations 470 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Allocs: 743 instructions: 2582 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 7317 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Done build fdeps 7317 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 15 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: edge: 11448 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: mean: 48.7149 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: median: 39.2429 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.057 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: safe = 424 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: unsafe = 43 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: inf = 1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: total = 468 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 470 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2579 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=742 blocks=1 instructions=2579 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Total: 468 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (468) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Rover zone: 0.904 (423) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.017 (8) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.079 (37) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.002 (1) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.998 (467) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.988 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 16841476 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2085 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 8650754 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.011 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.011 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 51 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 16841476 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2085 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 8650752 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 741 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=741 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=742 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: size = 469 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: found 75 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: largest = custom-call.177.2122_i1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: size = 168 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: found 507 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: mean: 6.03571 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: median: 6.98448 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 4056 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 656 memory location(s), 1 block(s), and 1304 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=656 blocks=1 instructions=1304 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 25492230, 41.3852% input load, 9.25497% output write, 49.3598% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: lo = 168 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: total = 168 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.027 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 7319 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Done build fdeps 7319 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.042 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 309mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2582 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=743 blocks=1 instructions=2582 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: 60 remat count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Num intervals 469 Num locations 469 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: edge: 11440 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: mean: 48.7846 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: median: 39.5271 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: safe = 423 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: unsafe = 43 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: inf = 1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: total = 467 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 469 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1835008, 7.1983% out of total dma traffic(1.055e+07) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.007 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 57221636 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2292 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 11534336 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2048 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.015 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.02 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.044 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: size = 530 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: found 142 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: largest = _dot.6-t1590_i16 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Total: 467 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (467) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Rover zone: 0.906 (423) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.015 (7) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.079 (37) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.002 (1) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.998 (466) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.990 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 16841476 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2085 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 8650752 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 206 bytes +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.026 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 15006468 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 8650754 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 1835008, 7.1983% out of total dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 23657222, 44.5953% input load, 9.97284% output write, 45.4318% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 15006468 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 8650754 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1188 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.018 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 742 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=742 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=743 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: size = 168 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 655 memory location(s), 1 block(s), and 1301 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=655 blocks=1 instructions=1301 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 25492228, 41.3852% input load, 9.25496% output write, 49.3598% spill/reload [sg0000] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: 93 remat count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1835008, 7.1983% out of total dma traffic(1.055e+07) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 310mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1290 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=641 blocks=1 instructions=1290 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Num intervals 530 Num locations 530 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: edge: 14181 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: mean: 53.5132 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: median: 44.0241 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: safe = 382 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: unsafe = 127 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: inf = 19 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: total = 528 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 530 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Total: 528 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (528) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Rover zone: 0.892 (471) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.011 (6) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.097 (51) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.002 (1) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Blocks tall: 0.998 (527) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.994 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: found 507 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: mean: 6.03571 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: median: 6.98448 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 4056 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 57221636 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2292 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 11534336 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2048 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.025 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 743 memory location(s), 1 block(s), and 2578 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=743 blocks=1 instructions=2578 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 68755972, 71.0237% input load, 3.05014% output write, 25.9262% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(4.8833e+07) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 8 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 8 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: lo = 168 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: total = 168 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.023 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4194304, 23.5294% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 15006468 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 8650752 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 1835008, 7.1983% out of total dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 23657220, 44.5953% input load, 9.97284% output write, 45.4318% spill/reload [sg0000] +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.051 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 15006468 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2388 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 8650752 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1689 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 2117632 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 206 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1188 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.026 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1287 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=640 blocks=1 instructions=1287 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 1843 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 55124484 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 9437184 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 32 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1843 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4194304, 6.10028% out of total dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 64561668, 75.6378% input load, 3.24829% output write, 21.1139% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 55124484 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 9437184 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1843 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1737 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.024 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2570 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=734 blocks=1 instructions=2570 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 48 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.044 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 57221636 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2292 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 11534338 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2047 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13439 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2640 blocks=1 instructions=13439 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1290 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=641 blocks=1 instructions=1290 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: reserved space = 164096 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1290 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=641 blocks=1 instructions=1290 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: size = 531 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: found 142 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: largest = _dot.6-t1590_i15 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 32 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_memsets: 1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 312mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1290 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=641 blocks=1 instructions=1290 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 34 out of 136 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 312mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1290 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=641 blocks=1 instructions=1290 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 312mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 641 memory location(s), 1 block(s), and 1291 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=641 blocks=1 instructions=1291 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 1291, number of allocs: 641 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2766-0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.000117 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2766-0] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: input0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: input1: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: input2: [ 4 1024 128 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: output0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 0 +Memory Location: {reshape.16}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 0 +Memory Location: {reshape.24}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 1024 / 1024 = 1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Scratch sbuf for kernel I-2766-0: [80128, 115804) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 16 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.049 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 312mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1287 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=640 blocks=1 instructions=1287 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: reserved space = 164096 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 0.03333 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.011 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1184 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1184 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: 93 remat count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1184 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1184 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1184 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1184 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Num intervals 531 Num locations 531 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: edge: 14189 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: mean: 53.4426 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: median: 43.6911 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 28 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1287 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=640 blocks=1 instructions=1287 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1287 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=640 blocks=1 instructions=1287 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 34 out of 135 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1287 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=640 blocks=1 instructions=1287 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 314mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 640 memory location(s), 1 block(s), and 1288 instruction(s). Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=640 blocks=1 instructions=1288 Max writers: 16 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 1288, number of allocs: 640 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2766-0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 8.8e-05 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2766-0] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: input0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: input1: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: input2: [ 4 1024 128 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: output0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 524288 +Memory Location: {reshape.16}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 524288 +Memory Location: {reshape.24}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 1024 / 1024 = 1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Scratch sbuf for kernel I-2766-0: [80128, 115804) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 316mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1200 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1200 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.050 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 316mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 316mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2570 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=734 blocks=1 instructions=2570 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1200 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1200 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: safe = 383 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: unsafe = 127 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: inf = 19 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: total = 529 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 531 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Total: 529 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (529) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Rover zone: 0.890 (471) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.013 (7) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.096 (51) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.002 (1) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Blocks tall: 0.998 (528) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.993 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: reserved space = 131072 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 0.029906 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1183 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1183 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 57221636 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2292 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 11534338 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2047 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1183 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1183 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1183 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1183 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.043 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.009 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1200 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1200 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1200 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1200 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Allocs: 1200 instructions: 2086 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2570 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=734 blocks=1 instructions=2570 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2570 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=734 blocks=1 instructions=2570 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 18 out of 129 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2570 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=734 blocks=1 instructions=2570 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 734 memory location(s), 1 block(s), and 2571 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=734 blocks=1 instructions=2571 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 2571, number of allocs: 734 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.115 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 744 memory location(s), 1 block(s), and 2581 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=744 blocks=1 instructions=2581 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 68755974, 71.0237% input load, 3.05014% output write, 25.9262% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2433-0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 0.01566 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2433-0] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: input0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: input1: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: input2: [ 4 1024 128 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: output0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 524288 +Memory Location: {reshape.60}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 524288 +Memory Location: {reshape.68}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 1024 / 1024 = 1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Scratch sbuf for kernel I-2433-0: [65024, 100700) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1199 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1199 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1199 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1199 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2766-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,85508>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 4668 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [build_flow_deps]: Done build fdeps 4668 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1200 memory location(s), 1 block(s), and 2086 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1200 blocks=1 instructions=2086 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(4.8833e+07) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 0.033175 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.012 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1277 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1277 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1277 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1277 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1277 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1277 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 8 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 8 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1188 memory location(s), 1 block(s), and 2058 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1188 blocks=1 instructions=2058 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4194304, 23.5294% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1293 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1293 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 323mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1293 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1293 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.020 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.027 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 324mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1199 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1199 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [build_flow_deps]: Allocs: 2640 instructions: 13438 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1199 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1199 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Allocs: 1199 instructions: 2083 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 1842 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 55124484 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 9437186 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 1842 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1188 memory location(s), 1 block(s), and 2058 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1188 blocks=1 instructions=2058 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 32 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4194304, 6.10028% out of total dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 64561670, 75.6378% input load, 3.2483% output write, 21.1139% spill/reload [sg0001] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 55124484 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2254 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 9437186 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 1842 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 1064960 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1737 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.041 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2573 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=735 blocks=1 instructions=2573 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 4666 edges +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.023 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2026 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1293 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1155 blocks=1 instructions=2026 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1293 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [build_flow_deps]: Done build fdeps 4666 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.015 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1293 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1293 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 52 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1199 memory location(s), 1 block(s), and 2083 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1199 blocks=1 instructions=2083 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Allocs: 1293 instructions: 3366 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9044 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 325mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2026 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1187 memory location(s), 1 block(s), and 2055 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1187 blocks=1 instructions=2055 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.035 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 328mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 329mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 330mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14220 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3089 blocks=1 instructions=14220 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:50Z INFO 9044 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 331mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1187 memory location(s), 1 block(s), and 2055 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1187 blocks=1 instructions=2055 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 8735 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [build_flow_deps]: Done build fdeps 8735 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.030 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1293 memory location(s), 1 block(s), and 3366 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1293 blocks=1 instructions=3366 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 32 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1281 memory location(s), 1 block(s), and 3338 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1281 blocks=1 instructions=3338 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 333mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2023 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1154 blocks=1 instructions=2023 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z USER 9044 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 335mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2023 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_memsets: 5 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 16 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 29 Sb address +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.064 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 337mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2573 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=735 blocks=1 instructions=2573 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: reserved space = 131072 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.026 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1281 memory location(s), 1 block(s), and 3338 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1281 blocks=1 instructions=3338 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.005 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2573 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=735 blocks=1 instructions=2573 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 35144 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [build_flow_deps]: Done build fdeps 35144 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2573 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=735 blocks=1 instructions=2573 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 18 out of 130 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 32 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2573 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=735 blocks=1 instructions=2573 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 336mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 735 memory location(s), 1 block(s), and 2574 instruction(s). Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=735 blocks=1 instructions=2574 Max writers: 24 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 2574, number of allocs: 735 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2433-0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 0.002154 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2433-0] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: input0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: input1: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: input2: [ 4 1024 128 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: output0: [ 4 128 1024 ] +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 0 +Memory Location: {reshape.60}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[131072,4],[1024,128],[1,1024]] +Offset: 0 +Memory Location: {reshape.68}@DRAM(1048576x2)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 1024 / 1024 = 1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Scratch sbuf for kernel I-2433-0: [65024, 100700) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: seq_len=1024, seq_len2=1024, complete_seq_len2=1024 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 0.016752 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.006 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1278 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1278 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1278 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1278 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1278 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1278 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.014 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3306 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1248 blocks=1 instructions=3306 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3306 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.191 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2640 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.033 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1294 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=1294 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1294 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1294 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z WARNING 9044 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2433-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,70404>(128x4)#Internal DebugInfo: +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1294 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=1294 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1294 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=1294 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Allocs: 1294 instructions: 3369 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [build_flow_deps]: Allocs: 3089 instructions: 14215 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 8737 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [build_flow_deps]: Done build fdeps 8737 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.016 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1294 memory location(s), 1 block(s), and 3369 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=1294 blocks=1 instructions=3369 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.046 seconds +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1282 memory location(s), 1 block(s), and 3341 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1282 blocks=1 instructions=3341 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2640 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2640 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 339mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 340mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: size = 1062 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: found 1273 edges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: mean: 2.39736 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: median: 1.98918 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 10184 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.050 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 345mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1282 memory location(s), 1 block(s), and 3341 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: lo = 988 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: total = 1062 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1282 blocks=1 instructions=3341 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.052 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 32 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.030 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3309 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1249 blocks=1 instructions=3309 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z USER 9044 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 47058 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [build_flow_deps]: Done build fdeps 47058 Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:50Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3309 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.033 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.261 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 347mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3089 memory location(s), 1 block(s), and 14215 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3089 blocks=1 instructions=14215 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.111 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 199672978 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3422 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 6444544 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3510 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.069 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3026 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3026 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: size = 1540 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: found 1057 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1193_i18 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: size = 1186 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: 371 remat count +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: found 1335 edges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: mean: 2.25126 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: median: 1.82863 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 10680 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Num intervals 1540 Num locations 1540 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: edge: 16008 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: mean: 20.7896 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: median: 14.7752 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: lo = 1112 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: total = 1186 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: safe = 1415 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: unsafe = 106 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: inf = 17 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: total = 1538 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1540 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.116 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 349mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Total: 1538 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (1538) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Rover zone: 0.964 (1482) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.012 (19) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.024 (37) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.015 (23) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.001 (2) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.716 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.714 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.714 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.984 (1513) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.789 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.998 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.037 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 346mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 199672978 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3422 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 6444544 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3510 bytes +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.175 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 342mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 62 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.049 seconds +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 344mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2642 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2642 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 206117522, 93.8194% input load, 0% output write, 6.18055% spill/reload [sg0002] +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.084 seconds +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 345mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 200308382 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3398 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 6459915 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2894 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: size = 1792 +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: found 1181 accumulation groups +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1193_i3 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.93378e+08) +2025-11-04T21:38:50Z INFO 9044 []: find first defs for local +2025-11-04T21:38:50Z INFO 9044 []: find first defs for global +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: 381 remat count +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Num intervals 1792 Num locations 1792 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: edge: 17592 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: mean: 19.6339 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: median: 13.0344 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:50Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: safe = 1665 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: unsafe = 108 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: inf = 17 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: total = 1790 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1792 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Total: 1790 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (1790) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Rover zone: 0.938 (1679) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.035 (62) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.025 (45) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Slice zone: 0.002 (4) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.063 (113) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.007 (12) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.588 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.612 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.842 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.930 (1665) +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.709 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.975 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:50Z INFO 9044 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 3422 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 3510 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 199672978 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3422 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 6444544 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3510 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 206117522, 93.8194% input load, 0% output write, 6.18055% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 199672978 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3422 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 6444544 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3510 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3423 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.242 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 351mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 197 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 200308382 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3398 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 6459915 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2894 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.291 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 350mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.021 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 347mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3028 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=3028 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 206768297, 93.6771% input load, 1.93453e-06% output write, 6.32293% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.93694e+08) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 17 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 3398 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 2894 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 200308382 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3398 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 6459915 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2894 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 206768297, 93.6771% input load, 1.93453e-06% output write, 6.32293% spill/reload [sg0002] +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 200308382 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3398 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 6459915 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2894 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3378 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.180 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 349mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 72 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 6 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.343 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 348mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: spill space = 2097152 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 2097152 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: size = 4 +2025-11-04T21:38:51Z INFO 9044 []: find first defs for local +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 210 Sb address +2025-11-04T21:38:51Z INFO 9044 []: find first defs for global +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: lo = 4 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: total = 4 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 2097152 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.060 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 353mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 2097152 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 2097152 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.019 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 351mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 601 out of 1262 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.007 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 353mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13438 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2641 blocks=1 instructions=13438 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 352mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 13441, number of allocs: 2641 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.001417 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 352mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 352mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 352mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.012 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 353mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 352mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1267_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z WARNING 9044 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1272_i1}@SB<96,17536>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.032 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 353mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.022 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 353mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [build_flow_deps]: Allocs: 2641 instructions: 13441 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 46 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 35148 edges +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [build_flow_deps]: Done build fdeps 35148 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.076 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 356mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 164 Sb address +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.397 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 368mb, ru_maxrss: 373mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: spill space = 2104324 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 2125824 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: size = 11 +2025-11-04T21:38:51Z INFO 9044 []: find first defs for local +2025-11-04T21:38:51Z INFO 9044 []: find first defs for global +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Num intervals 11 Num locations 11 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: lo = 11 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: total = 11 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 2097152 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.091 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 377mb (delta=4mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 2097152 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.134 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 377mb (delta=4mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 4194304 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 4194304 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 2097152 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.055 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.030 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2641 blocks=1 instructions=13441 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 601 out of 1401 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.014 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14152 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3027 blocks=1 instructions=14152 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:51Z USER 9044 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.012 seconds +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2641 memory location(s), 1 block(s), and 13441 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.017 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 364mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 14155, number of allocs: 3027 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.001539 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 362mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 362mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 362mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9044 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.046 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.049 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [build_flow_deps]: Allocs: 3027 instructions: 14155 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 46999 edges +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [build_flow_deps]: Done build fdeps 46999 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.111 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 377mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.116 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 383mb (delta=6mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.038 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3027 blocks=1 instructions=14155 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.013 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3027 memory location(s), 1 block(s), and 14155 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 2.366 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=10mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=10474 blocks=6 instructions=38260 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2309 blocks=2 instructions=4049 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2497 blocks=2 instructions=6615 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5668 blocks=2 instructions=27596 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2309 memory location(s), 2 block(s), and 4049 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=2309 blocks=2 instructions=4049 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5668 memory location(s), 2 block(s), and 27596 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2497 memory location(s), 2 block(s), and 6615 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=5668 blocks=2 instructions=27596 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=2497 blocks=2 instructions=6615 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2309 memory location(s), 2 block(s), and 4053 instruction(s). Max writers: 33 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=2309 blocks=2 instructions=4053 Max writers: 33 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2497 memory location(s), 2 block(s), and 6619 instruction(s). Max writers: 33 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=2497 blocks=2 instructions=6619 Max writers: 33 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2309 memory location(s), 2 block(s), and 4057 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.017 seconds +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5674 memory location(s), 2 block(s), and 27614 instruction(s). Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=5674 blocks=2 instructions=27614 Max writers: 298 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.022 seconds +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2497 memory location(s), 2 block(s), and 6623 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.084 seconds +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5674 memory location(s), 2 block(s), and 27618 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.108 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 372mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38298 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: reserved space = 131328 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: spill space = 23068672 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 23068672 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.003 seconds +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: reserved space = 131328 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: spill space = 23068672 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 23068672 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:52Z INFO 9044 []: find first defs for local +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: reserved space = 98304 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: spill space = 29360128 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 29360128 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: size = 9 +2025-11-04T21:38:52Z INFO 9044 []: find first defs for local +2025-11-04T21:38:52Z INFO 9044 []: find first defs for global +2025-11-04T21:38:52Z INFO 9044 []: find first defs for global +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: reserved space = 98304 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: spill space = 29360128 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 29360128 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.011 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 14680064 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 14680064 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: reserved space = 2129920 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: spill space = 17095682 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 17141760 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 23068672 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.025 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.021 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13452 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: lo = 9 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: total = 9 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 16777216 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 16777216 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: reserved space = 2139148 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: spill space = 17095682 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 17141760 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: size = 19 +2025-11-04T21:38:52Z INFO 9044 []: find first defs for local +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 27262976 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.042 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 []: find first defs for global +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Num intervals 19 Num locations 19 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: lo = 19 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: total = 19 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 2097152 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 2097152 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 10502144 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 10502144 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 15011840 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.094 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 375mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.097 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38298 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=2309 blocks=2 instructions=4057 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=2497 blocks=2 instructions=6623 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=5674 blocks=2 instructions=27618 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2497 memory location(s), 2 block(s), and 6623 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5674 memory location(s), 2 block(s), and 27618 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.007 seconds +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2309 memory location(s), 2 block(s), and 4057 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.009 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38298 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.014 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13452 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.007 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.026 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.028 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38298 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9044 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=5046 blocks=3 instructions=18789 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:52Z INFO 9044 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=5434 blocks=3 instructions=19509 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.081 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 5434 memory location(s), 3 block(s), and 19509 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.144 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 5046 memory location(s), 3 block(s), and 18789 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: nc_parallel_pass finished after 0.149 seconds +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:52Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38298 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 370mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13452 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 371mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware simulation time: 29678805 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.067 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 376mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware simulation time: 698745 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.073 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware simulation time: 685465 +2025-11-04T21:38:52Z USER 9044 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.006 seconds +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.087 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z USER 9044 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 374mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware simulation time: 29334582 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.127 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 376mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z USER 9044 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 383mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:52Z INFO 9044 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:53Z INFO 9044 [post_scheduler]: Time-aware simulation time: 1747874 +2025-11-04T21:38:53Z INFO 9044 [post_scheduler]: Time-aware simulation time: 1589382 +2025-11-04T21:38:53Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.637 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 413mb, ru_maxrss: 413mb (delta=30mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13452 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13452 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2644 blocks=1 instructions=13452 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z INFO 9044 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.641 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=30mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.006 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.017 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:53Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.671 seconds +2025-11-04T21:38:53Z INFO 9044 [BackendPassManager]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=30mb) +2025-11-04T21:38:53Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:53Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38294 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z USER 9044 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z INFO 9044 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2497 blocks=2 instructions=6623 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9044 (sg01) [SubgraphForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2497 memory location(s), 2 block(s), and 6623 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:53Z INFO 9044 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5674 blocks=2 instructions=27614 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2309 blocks=2 instructions=4057 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9044 (sg02) [SubgraphForkPass]: curr_vmrss: 399mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5674 memory location(s), 2 block(s), and 27614 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 399mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2309 memory location(s), 2 block(s), and 4057 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:53Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:53Z INFO 9044 [BackendPassManager]: curr_vmrss: 399mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:53Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38294 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 109 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 33 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 121 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 109 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 33 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 131 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 121 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 70 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 20 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 41 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 62 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 136 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 70 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 101 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 62 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 42 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 108 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 106 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.092 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 69 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 103 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.101 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 16 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 45 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 70 Sb address +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.030 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.026 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 106 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [build_flow_deps]: Allocs: 1154 instructions: 2027 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.028 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [build_flow_deps]: Allocs: 1155 instructions: 2030 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.163 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 107 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 4629 edges +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [build_flow_deps]: Done build fdeps 4629 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 4632 edges +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [build_flow_deps]: Done build fdeps 4632 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.021 seconds +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.018 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 8 │ 2489319424 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 53504 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10496516 │ +│ Load │ Internal │ 94 │ 7602176 │ +│ Save │ Internal │ 66 │ 7340032 │ +│ Save │ Internal -> Output │ 7 │ 2359298 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 2 │ +│ 256 │ 98 │ +│ 512 │ 1 │ +│ 1024 │ 52 │ +│ 2048 │ 17 │ +│ 4096 │ 30 │ +│ 1048576 │ 32 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 913 #MatMult-Transposes 225 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: IO Tensor size combined: 457978372 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input0 │ ExternalInput │ int32 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 4194304 │ +│ intermediate3 │ Output │ bfloat16 │ 4194304 │ +│ intermediate3-buffer-2754 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1 │ Internal │ bfloat16 │ 4194304 │ +│ dot.4-buffer-2752 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.29 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.24 │ Internal │ bfloat16 │ 2097152 │ +│ transpose.1 │ Internal │ bfloat16 │ 2097152 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.16 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9044 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2030 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 402mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 8 │ 2489319424 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 53504 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10496516 │ +│ Load │ Internal │ 94 │ 7602176 │ +│ Save │ Internal │ 66 │ 7340032 │ +│ Save │ Internal -> Output │ 6 │ 2359296 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 2 │ +│ 256 │ 98 │ +│ 512 │ 1 │ +│ 1024 │ 52 │ +│ 2048 │ 17 │ +│ 4096 │ 30 │ +│ 1048576 │ 32 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 913 #MatMult-Transposes 225 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: IO Tensor size combined: 457978372 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input0 │ ExternalInput │ int32 │ 4096 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 4194304 │ +│ intermediate3 │ Output │ bfloat16 │ 4194304 │ +│ intermediate3-buffer-2754 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1 │ Internal │ bfloat16 │ 4194304 │ +│ dot.4-buffer-2752 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.29 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.24 │ Internal │ bfloat16 │ 2097152 │ +│ transpose.1 │ Internal │ bfloat16 │ 2097152 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 2097152 │ +│ reshape.16 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9044 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2027 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.198 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 403mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.046 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.040 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.035 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [build_flow_deps]: Allocs: 1248 instructions: 3310 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.014 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [build_flow_deps]: Allocs: 1249 instructions: 3313 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 697 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 8664 edges +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [build_flow_deps]: Done build fdeps 8664 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 786 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 8661 edges +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [build_flow_deps]: Done build fdeps 8661 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.032 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 406mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 171 │ 48243204 │ +│ Load │ Input -> Internal │ 6 │ 524288 │ +│ Load │ Internal │ 84 │ 9437184 │ +│ Save │ Internal │ 68 │ 8388608 │ +│ Save │ Internal -> Output │ 5 │ 2097154 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 4 │ +│ 256 │ 97 │ +│ 512 │ 4 │ +│ 1024 │ 130 │ +│ 2048 │ 8 │ +│ 4096 │ 88 │ +│ 1048576 │ 32 │ +│ 2097152 │ 3 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 2100 #MatMult-Transposes 248 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 4194304 │ +│ dot.7-buffer-2414 │ Internal │ bfloat16 │ 4194304 │ +│ dot.11-buffer-2419 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate5 │ Output │ bfloat16 │ 4194304 │ +│ intermediate0 │ Input │ bfloat16 │ 4194304 │ +│ all_reduce.1-buffer-2416 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate6-buffer-2421 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate6 │ Output │ bfloat16 │ 4194304 │ +│ add.4 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.60 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9044 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.008 seconds +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 404mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3313 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.043 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 404mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 171 │ 48243204 │ +│ Load │ Input -> Internal │ 6 │ 524288 │ +│ Load │ Internal │ 84 │ 9437184 │ +│ Save │ Internal │ 68 │ 8388608 │ +│ Save │ Internal -> Output │ 4 │ 2097152 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 4 │ +│ 256 │ 97 │ +│ 512 │ 4 │ +│ 1024 │ 130 │ +│ 2048 │ 8 │ +│ 4096 │ 88 │ +│ 1048576 │ 32 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 2100 #MatMult-Transposes 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 4194304 │ +│ dot.7-buffer-2414 │ Internal │ bfloat16 │ 4194304 │ +│ dot.11-buffer-2419 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate5 │ Output │ bfloat16 │ 4194304 │ +│ intermediate0 │ Input │ bfloat16 │ 4194304 │ +│ all_reduce.1-buffer-2416 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate6-buffer-2421 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate6 │ Output │ bfloat16 │ 4194304 │ +│ add.4 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.60 │ Internal │ bfloat16 │ 2097152 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:53Z USER 9044 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 404mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3310 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 13 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 13 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 46 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 97 PSUM Banks +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 55 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 38 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-11-04T21:38:53Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 37 Sb address +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.618 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 405mb, ru_maxrss: 413mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 64 Sb address +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.088 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 419mb, ru_maxrss: 419mb (delta=6mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.720 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 414mb, ru_maxrss: 419mb (delta=6mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.020 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 408mb, ru_maxrss: 419mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [build_flow_deps]: Allocs: 2644 instructions: 13448 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 35064 edges +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [build_flow_deps]: Done build fdeps 35064 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.102 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 421mb, ru_maxrss: 421mb (delta=2mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 2 │ 4194304 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 448 │ 193345548 │ +│ Load │ Internal │ 20 │ 6294662 │ +│ Save │ Internal │ 312 │ 6444544 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 97 │ +│ 2048 │ 1 │ +│ 4096 │ 370 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 11178 #MatMult-Transposes 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348925968 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ convert.53 │ Internal │ bfloat16 │ 4194304 │ +│ all_reduce.3-buffer-2033 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate84 │ Input │ bfloat16 │ 4194304 │ +│ dot.14-buffer-2031 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate83 │ Input │ bfloat16 │ 4194304 │ +│ add.9 │ Internal │ bfloat16 │ 4194304 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ all_reduce.3_pftranspose_1000-t1614_i3 │ Internal │ bfloat16 │ 1048576 │ +│ all_reduce.3_pftranspose_1000-t1614_i2 │ Internal │ bfloat16 │ 1048576 │ +│ add.9_pftranspose_996-t1610_i3 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.016 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 420mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13448 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.150 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 416mb, ru_maxrss: 421mb (delta=2mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.030 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [build_flow_deps]: Allocs: 3030 instructions: 14166 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 45387 edges +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [build_flow_deps]: Done build fdeps 45387 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.127 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 410mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 4 │ 4194304 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 448 │ 193345548 │ +│ Load │ Internal │ 34 │ 6613898 │ +│ Save │ Internal │ 329 │ 6459911 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 113 │ +│ 2048 │ 2 │ +│ 4096 │ 370 │ +│ 9496 │ 2 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 11302 #MatMult-Transposes 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348925968 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input1 │ ExternalInput │ int32 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ add.9 │ Internal │ bfloat16 │ 4194304 │ +│ convert.53 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate84 │ Input │ bfloat16 │ 4194304 │ +│ dot.14-buffer-2031 │ Internal │ bfloat16 │ 4194304 │ +│ intermediate83 │ Input │ bfloat16 │ 4194304 │ +│ all_reduce.3-buffer-2033 │ Internal │ bfloat16 │ 4194304 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t3025 │ Internal │ float32 │ 1048576 │ +│ -t3019 │ Internal │ float32 │ 1048576 │ +│ -t3014 │ Internal │ float32 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.014 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14166 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 1.055 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=8mb) +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=10480 blocks=6 instructions=38294 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 85 DMA instructions. Moved 19 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 84 DMA instructions. Moved 18 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 73 DMA instructions. Moved 5 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 72 DMA instructions. Moved 4 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 336 DMA instructions. Moved 7 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 317 DMA instructions. Moved 5 DMA instructions to CC's engines. +2025-11-04T21:38:54Z INFO 9044 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: assign_trigger_engine finished after 0.020 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Output has 6 module(s), 6 function(s), 10480 memory location(s), 6 block(s), and 38294 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38294 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2644 blocks=1 instructions=13448 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=3030 blocks=1 instructions=14166 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.017 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=1154 blocks=1 instructions=2027 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=1249 blocks=1 instructions=3313 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=1248 blocks=1 instructions=3310 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=1155 blocks=1 instructions=2030 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.027 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=10480 blocks=6 instructions=38308 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: assign_hwdge_engine finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Output has 6 module(s), 6 function(s), 10480 memory location(s), 6 block(s), and 38308 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38308 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=3030 blocks=1 instructions=14169 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2644 blocks=1 instructions=13451 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 11 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 9 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 444 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 12 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 28 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 9 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 10 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 14 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 444 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 12 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2644 blocks=1 instructions=13451 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=3030 blocks=1 instructions=14169 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.008 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1155 blocks=1 instructions=2032 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1154 blocks=1 instructions=2029 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1249 blocks=1 instructions=3315 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=1248 blocks=1 instructions=3312 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 32 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 98 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 72 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 66 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 32 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 98 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 71 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 66 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1155 blocks=1 instructions=2032 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1154 blocks=1 instructions=2029 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 32 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 89 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 213 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 68 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1249 blocks=1 instructions=3315 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 32 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 89 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 212 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 68 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.004 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=1248 blocks=1 instructions=3312 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.019 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.010 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.028 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38308 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:54Z INFO 9044 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=5434 blocks=3 instructions=19516 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 5434 memory location(s), 3 block(s), and 19516 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:54Z INFO 9044 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=5046 blocks=3 instructions=18792 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 5046 memory location(s), 3 block(s), and 18792 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: nc_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38308 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1155 blocks=1 instructions=2032 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1248 blocks=1 instructions=3312 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2644 blocks=1 instructions=13451 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1155 blocks=1 instructions=2032 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1249 blocks=1 instructions=3315 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=3030 blocks=1 instructions=14169 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2644 blocks=1 instructions=13451 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=1154 blocks=1 instructions=2029 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1154 blocks=1 instructions=2029 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=3030 blocks=1 instructions=14169 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1248 blocks=1 instructions=3312 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1155 blocks=1 instructions=2032 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=1249 blocks=1 instructions=3315 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1964 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 2158 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 2158 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.009 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1249 blocks=1 instructions=3315 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.018 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1154 blocks=1 instructions=2029 Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.023 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 409mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=1248 blocks=1 instructions=3312 Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 1971 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 2163 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 2163 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.038 seconds +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.039 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 410mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=3030 blocks=1 instructions=14169 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 410mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2644 blocks=1 instructions=13451 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Finished dependency reduction: 8804 removed, new total 1134 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.044 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 410mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1155 memory location(s), 1 block(s), and 2032 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 3497 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 3525 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 3837 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 3837 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 3864 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 3864 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Finished dependency reduction: 8680 removed, new total 1132 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.080 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 414mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1154 memory location(s), 1 block(s), and 2029 instruction(s). Max writers: 34 Max Readers: 224 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 12501 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 13037 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 14186 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 14186 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Finished dependency reduction: 17210 removed, new total 1392 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.145 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 418mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1249 memory location(s), 1 block(s), and 3315 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 13281 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 13281 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Finished dependency reduction: 17306 removed, new total 1381 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.146 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 421mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1248 memory location(s), 1 block(s), and 3312 instruction(s). Max writers: 34 Max Readers: 248 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Finished dependency reduction: 77784 removed, new total 4256 +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.279 seconds +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 424mb, ru_maxrss: 424mb (delta=3mb) +2025-11-04T21:38:54Z INFO 9044 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3030 memory location(s), 1 block(s), and 14169 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Finished dependency reduction: 60385 removed, new total 3432 +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:54Z USER 9044 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.295 seconds +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 423mb, ru_maxrss: 424mb (delta=3mb) +2025-11-04T21:38:54Z INFO 9044 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2644 memory location(s), 1 block(s), and 13451 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.359 seconds +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: curr_vmrss: 420mb, ru_maxrss: 424mb (delta=3mb) +2025-11-04T21:38:54Z USER 9044 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:54Z INFO 9044 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=10480 blocks=6 instructions=38308 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z USER 9044 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:54Z USER 9044 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:54Z INFO 9044 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=5434 blocks=3 instructions=19516 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=5046 blocks=3 instructions=18792 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:54Z INFO 9044 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:54Z INFO 9044 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:54Z INFO 9044 (nc01/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:54Z INFO 9044 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:54Z INFO 9044 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:54Z INFO 9044 (nc00/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:54Z INFO 9044 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:54Z INFO 9044 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:54Z INFO 9044 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 68915 #MatMult-Transposes 12163 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 216 #out: 0 #inp: 216 #symmetric: 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 68791 #MatMult-Transposes 12163 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 216 #out: 0 #inp: 216 #symmetric: 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: bir_linker finished after 0.393 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 627mb, ru_maxrss: 627mb (delta=203mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: bir_linker finished after 0.422 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=203mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 302725018, 83.4954% input load, 1.47211% output write, 15.0325% spill/reload +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.004 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 8 │ 2489319424 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 53504 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10496516 │ +│ Load │ Internal │ 94 │ 7602176 │ +│ Save │ Internal │ 66 │ 7340032 │ +│ Save │ Internal -> Output │ 6 │ 2359296 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 2 │ +│ 256 │ 98 │ +│ 512 │ 1 │ +│ 1024 │ 52 │ +│ 2048 │ 17 │ +│ 4096 │ 30 │ +│ 1048576 │ 32 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 171 │ 48243204 │ +│ Load │ Input -> Internal │ 6 │ 524288 │ +│ Load │ Internal │ 84 │ 9437184 │ +│ Save │ Internal │ 68 │ 8388608 │ +│ Save │ Internal -> Output │ 4 │ 2097152 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 4 │ +│ 256 │ 97 │ +│ 512 │ 4 │ +│ 1024 │ 130 │ +│ 2048 │ 8 │ +│ 4096 │ 88 │ +│ 1048576 │ 32 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 2 │ 4194304 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 448 │ 193345548 │ +│ Load │ Internal │ 20 │ 6294662 │ +│ Save │ Internal │ 312 │ 6444544 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 97 │ +│ 2048 │ 1 │ +│ 4096 │ 370 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 14191 #MatMult-Transposes 5715 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781430828 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 4194304 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: report_stats finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 303375797, 83.4205% input load, 1.46896% output write, 15.1106% spill/reload +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.008 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:55Z INFO 9044 []: find first defs for local +2025-11-04T21:38:55Z INFO 9044 []: find first defs for global +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 8 │ 2489319424 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 53504 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10496516 │ +│ Load │ Internal │ 94 │ 7602176 │ +│ Save │ Internal │ 66 │ 7340032 │ +│ Save │ Internal -> Output │ 7 │ 2359298 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 2 │ +│ 256 │ 98 │ +│ 512 │ 1 │ +│ 1024 │ 52 │ +│ 2048 │ 17 │ +│ 4096 │ 30 │ +│ 1048576 │ 32 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal -> ExternalOutput │ 32 │ 1073741824 │ +│ DMACopy │ Internal -> Output │ 1 │ 8388608 │ +│ DMACopy (Spill) │ Internal │ 32 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 65536 │ +│ Load │ ExternalInput -> Internal │ 171 │ 48243204 │ +│ Load │ Input -> Internal │ 6 │ 524288 │ +│ Load │ Internal │ 84 │ 9437184 │ +│ Save │ Internal │ 68 │ 8388608 │ +│ Save │ Internal -> Output │ 5 │ 2097154 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 64 │ 4 │ +│ 256 │ 97 │ +│ 512 │ 4 │ +│ 1024 │ 130 │ +│ 2048 │ 8 │ +│ 4096 │ 88 │ +│ 1048576 │ 32 │ +│ 2097152 │ 3 │ +│ 4194304 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 6291456 │ +│ DMACopy │ Internal │ 4 │ 4194304 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 448 │ 193345548 │ +│ Load │ Internal │ 34 │ 6613898 │ +│ Save │ Internal │ 329 │ 6459911 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 113 │ +│ 2048 │ 2 │ +│ 4096 │ 370 │ +│ 9496 │ 2 │ +│ 2097152 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 14315 #MatMult-Transposes 5715 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781430828 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 4194304 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 4194304 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: report_stats finished after 0.019 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:55Z INFO 9044 []: find first defs for local +2025-11-04T21:38:55Z INFO 9044 []: find first defs for global +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.036 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: spill space = 235405368 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 235520000 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.038 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.091 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: spill space = 235405368 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 235520000 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: size = 86 +2025-11-04T21:38:55Z INFO 9044 []: find first defs for local +2025-11-04T21:38:55Z INFO 9044 []: find first defs for global +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 86 Num locations 86 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: lo = 86 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: total = 86 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 27262976 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 44576768 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.073 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9044 [BackendPassManager]: nc_parallel_pass finished after 0.633 seconds +2025-11-04T21:38:55Z INFO 9044 [BackendPassManager]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=203mb) +2025-11-04T21:38:55Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:55Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=11508 blocks=8 instructions=38392 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:55Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=11508 blocks=8 instructions=38392 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 38392 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:55Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:38:55Z INFO 9044 [BackendPassManager]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z USER 9044 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:55Z INFO 9044 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=11508 blocks=8 instructions=38392 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.037 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.044 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.022 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.029 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19558 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=5948 blocks=4 instructions=19558 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19565 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=5948 blocks=4 instructions=19565 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.029 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18834 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=5560 blocks=4 instructions=18834 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 432mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18841 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=5560 blocks=4 instructions=18841 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 5164/5164 (100% DGE) + power-of-2 partition : 5192/5255 (98.8011% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 5193/5256 (98.8014% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 4429/4433 (99.9098% DGE) + power-of-2 partition : 4429/4775 (92.7539% DGE) + > 3 dimensional : 0/4 (0% DGE) + non-integer desc size : 0/0 + total : 4429/4775 (92.7539% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 29 + Transpose : 896 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 904/904 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: lower_dma finished after 0.096 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19565 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=5948 blocks=4 instructions=19565 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: expand_all_engine finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19565 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=5948 blocks=4 instructions=19565 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.045 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19565 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=5948 blocks=4 instructions=19565 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 5164/5164 (100% DGE) + power-of-2 partition : 5164/5198 (99.3459% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 5165/5199 (99.346% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 4424/4428 (99.9097% DGE) + power-of-2 partition : 4424/4737 (93.3924% DGE) + > 3 dimensional : 0/4 (0% DGE) + non-integer desc size : 0/0 + total : 4424/4737 (93.3924% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 29 + Transpose : 896 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 904/904 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: lower_dma finished after 0.146 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18841 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=5560 blocks=4 instructions=18841 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: expand_all_engine finished after 0.015 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18841 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=5560 blocks=4 instructions=18841 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: expand_inst_late finished after 0.035 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19712 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=5948 blocks=4 instructions=19712 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [SeqInstOpt]: Removing 72 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [SeqInstOpt]: Removing 65 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 19575 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=5948 blocks=4 instructions=19575 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: lower_sync finished after 0.021 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21099 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=5948 blocks=4 instructions=21099 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: lower_act finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:55Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=5948 blocks=4 instructions=21113 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.054 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 433mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18841 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=5560 blocks=4 instructions=18841 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: expand_inst_late finished after 0.053 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 435mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18988 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=5560 blocks=4 instructions=18988 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [SeqInstOpt]: Removing 72 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [SeqInstOpt]: Removing 65 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z INFO 9044 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 436mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 18851 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:55Z USER 9044 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:55Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=5560 blocks=4 instructions=18851 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: lower_sync finished after 0.030 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 436mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20220 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=5560 blocks=4 instructions=20220 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: lower_act finished after 0.012 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 436mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=5560 blocks=4 instructions=20233 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:56Z USER 9044 (nc00) [CoreForkPass]: lower_dve finished after 0.161 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 437mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=5948 blocks=4 instructions=21113 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00) [CoreForkPass]: lower_ap finished after 0.011 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 437mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=5948 blocks=4 instructions=21113 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: lower_dve finished after 0.179 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 438mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=5560 blocks=4 instructions=20233 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: lower_ap finished after 0.010 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 440mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=5560 blocks=4 instructions=20233 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z USER 9044 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.168 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: curr_vmrss: 440mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:56Z INFO 9044 []: find first defs for local reg +2025-11-04T21:38:56Z INFO 9044 []: find first defs for global reg +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:56Z USER 9044 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.093 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: nc_parallel_pass finished after 0.749 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: vnc_remote_addr_map finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 41346 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 [VncLink]: Found 0 remote updates +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 41346 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=5560 blocks=4 instructions=20233 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=5948 blocks=4 instructions=21113 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.095 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.141 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.145 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:56Z INFO 9044 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.018 seconds +2025-11-04T21:38:56Z INFO 9044 (sg00) [SubgraphForkPass]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 41346 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: subgraph_parallel_pass finished after 0.026 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 441mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=5948 blocks=4 instructions=21113 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=5560 blocks=4 instructions=20233 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89233 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000533108 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89233 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000535022 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 14299 │ +│ LDWEIGHTS │ 14281 │ +│ EVENT_SEMAPHORE │ 1369 │ +│ UNKNOWN(0xd4) │ 1069 │ +│ CAST │ 770 │ +│ COPY │ 740 │ +│ ACTIVATE │ 515 │ +│ TENSOR_TENSOR │ 463 │ +│ PSEUDO_DMA_TRIGGER │ 389 │ +│ UNKNOWN(0xd3) │ 145 │ +│ TENSOR_SCALAR_ADDR │ 145 │ +│ UNKNOWN(0x9a) │ 96 │ +│ UNKNOWN(0x9b) │ 96 │ +│ MEMSET │ 93 │ +│ UNKNOWN(0xda) │ 80 │ +│ UNKNOWN(0x92) │ 72 │ +│ TENSOR_REDUCE │ 69 │ +│ RECIPROCAL │ 65 │ +│ UNKNOWN(0x24) │ 64 │ +│ UNKNOWN(0xd8) │ 50 │ +│ TENSOR_SCALAR │ 38 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ LOAD_MASK_SELECT │ 16 │ +│ STREAM_SHUFFLE │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 13 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ ALU_OP │ 2 │ +│ IOTA │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 1807 │ +│ Scalar │ 2995 │ +│ Tensor │ 28810 │ +│ SyncDMA │ 0 │ +│ Vector │ 1162 │ +│ Sync │ 272 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.306 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 49152 │ +│ qPoolSpillReload0_defId_1 │ 49152 │ +│ qPoolSpillReload0_defId_2 │ 7 │ +│ qSPIO0 │ 43092 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 4110 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 146113 (0.00217725 GB) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2766-0_b2_grp_7_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_grp_7_sec_0_mhlo_exponential_6_b2_i0_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2433-0_b0_grp_5_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_grp_6_sec_0_mhlo_exponential_6_b2_i0_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_grp_5_sec_0_mhlo_exponential_6_b0_i0_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_b2_grp_4_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 8 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1758_sg0001 │ Internal │ int32 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 297 │ +└──────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.027 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 14547 │ +│ LDWEIGHTS │ 14529 │ +│ EVENT_SEMAPHORE │ 1524 │ +│ UNKNOWN(0xd4) │ 1076 │ +│ COPY │ 868 │ +│ CAST │ 770 │ +│ ACTIVATE │ 522 │ +│ TENSOR_TENSOR │ 465 │ +│ PSEUDO_DMA_TRIGGER │ 427 │ +│ GATHER │ 291 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ TENSOR_SCALAR_ADDR │ 145 │ +│ UNKNOWN(0xd3) │ 145 │ +│ DVE_READ_INDICES │ 128 │ +│ MAX8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MEMSET │ 107 │ +│ UNKNOWN(0x9a) │ 96 │ +│ UNKNOWN(0x9b) │ 96 │ +│ UNKNOWN(0xda) │ 80 │ +│ TENSOR_REDUCE │ 74 │ +│ UNKNOWN(0x92) │ 72 │ +│ RECIPROCAL │ 67 │ +│ UNKNOWN(0x24) │ 64 │ +│ UNKNOWN(0xd8) │ 50 │ +│ TENSOR_SCALAR │ 40 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ STREAM_SHUFFLE │ 20 │ +│ LOAD_MASK_SELECT │ 20 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 14 │ +│ MOVE │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ UNKNOWN(0xd9) │ 7 │ +│ UNKNOWN(0xe8) │ 5 │ +│ UNKNOWN(0xe5) │ 2 │ +│ ALU_OP │ 2 │ +│ IOTA │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2450 │ +│ Scalar │ 3139 │ +│ Tensor │ 29309 │ +│ SyncDMA │ 0 │ +│ Vector │ 1802 │ +│ Sync │ 309 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.373 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 142 │ +│ qPoolSpillReload0_defId_0 │ 49152 │ +│ qPoolSpillReload0_defId_1 │ 49152 │ +│ qPoolSpillReload0_defId_2 │ 207 │ +│ qSPIO0 │ 43094 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 4454 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 146861 (0.0021884 GB) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qSPIO0 │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2433-0_b3_grp_6_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_b0_grp_6_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2433-0_b3_grp_7_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 8 │ +│ I-2766-0_grp_5_sec_0_mhlo_exponential_6_b0_i0_sg0000 │ Internal │ bfloat16 │ 8 │ +│ I-2433-0_grp_4_sec_0_mhlo_exponential_6_b2_i0_sg0001 │ Internal │ bfloat16 │ 8 │ +│ all-reduce.465.2434_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1758_sg0001 │ Internal │ int32 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 298 │ +└──────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.066 seconds +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.026 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:56Z USER 9044 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.425 seconds +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 5560 memory location(s), 4 block(s), and 20233 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.049 seconds +2025-11-04T21:38:56Z USER 9044 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.461 seconds +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 5948 memory location(s), 4 block(s), and 21113 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: mod_parallel_pass finished after 0.465 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 462mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.344KB │ 61.312KB │ +│ CCE │ 672.000KB │ 48.000B │ +│ Transpose │ 0.000B │ 1.500MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.000KB │ 110.250KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.689GB │ +│ Model Code │ 2.259MB │ +│ Model Constants │ 561.012KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 42.508MB │ +│ DMA Ring IO │ 689.344KB │ +│ DMA Ring Spill │ 1.668MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬───────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼───────────┼───────────────────┤ +│ Copy │ 1.312KB │ 49.656KB │ +│ CCE │ 672.000KB │ 48.000B │ +│ Transpose │ 0.000B │ 1.500MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 15.750KB │ 94.500KB │ +└───────────────┴───────────┴───────────────────┘ + +2025-11-04T21:38:56Z INFO 9044 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.673GB │ +│ Model Code │ 2.139MB │ +│ Model Constants │ 559.004KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 26.000MB │ +│ DMA Ring IO │ 689.062KB │ +│ DMA Ring Spill │ 1.641MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9044 [HBMUsage]: Total estimated HBM usage is: 3.719GB +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: hbm_usage finished after 0.004 seconds +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: curr_vmrss: 462mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 41346 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z USER 9044 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:56Z INFO 9044 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=11508 blocks=8 instructions=41346 Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1736_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1653-1738_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1664-1740_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2051_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2038_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1543-1639_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1554-1641_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1565-1643_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1575-1645_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1783_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1134-1355_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1563_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1736_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1653-1738_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1664-1740_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2051_CRSM.npy +2025-11-04T21:38:56Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2038_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1543-1639_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1554-1641_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1565-1643_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1575-1645_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1783_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1563_CRSM.npy +2025-11-04T21:38:57Z INFO 9044 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:57Z WARNING 9044 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/metrics.json +2025-11-04T21:38:57Z WARNING 9044 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:57Z INFO 9044 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff +2025-11-04T21:38:57Z INFO 9044 [NeffFileWriter]: IR signature: 94edbc79dbeab9e50a2627ad929d62b4 for neff artifacts +2025-11-04T21:38:57Z USER 9044 [BackendPassManager]: neff_packager finished after 0.180 seconds +2025-11-04T21:38:57Z INFO 9044 [BackendPassManager]: curr_vmrss: 463mb, ru_maxrss: 627mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9044 [BackendPassManager]: Output has 2 module(s), 8 function(s), 11508 memory location(s), 8 block(s), and 41346 instruction(s). Max writers: 299 Max Readers: 5242 +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.021484 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.021484 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.025391 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.027344 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.001953 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.013981 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.001980 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.015965 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.001953 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.025391 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.041515 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.219345 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.001953 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.001953 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.001953 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.041515 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ all_gather.1 │ bfloat16 │ 1 │ 4.000000 MB │ +│ reshape.16 │ bfloat16 │ 1 │ 2.000000 MB │ +│ reshape.24 │ bfloat16 │ 1 │ 2.000000 MB │ +│ reshape.29 │ bfloat16 │ 1 │ 2.000000 MB │ +│ transpose.1 │ bfloat16 │ 1 │ 2.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=local (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬───────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼───────┼───────────────┼─────────────┤ +│ reshape.104 │ int32 │ 1 │ 0.000008 MB │ +└────────────────────────────────────────────────────────────────┴───────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.250000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.250000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 4.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.250000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.250000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:57Z INFO 9044 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:57Z INFO 8594 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:57Z INFO 8594 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:57Z INFO 8594 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg +2025-11-04T21:38:57Z INFO 8594 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:57Z INFO 8594 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk3/neuronxcc-miwah3fg/hlo_netlist.json +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:57Z INFO 8594 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:57Z INFO 8594 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:57Z INFO 8576 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk3/metaneff.pb b/context_encoding_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..6f537eb3ebc4a4d9ad90124aefd90aae6881c2af --- /dev/null +++ b/context_encoding_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a788ec9ea41bfa0696307ae7b82f6644a908b0b0a1feb7f30da3ca4349d0c13 +size 2955932 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb b/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..0c09755f13cb2f4f4c750297fadd5545eaf32cce --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616f0c948889cd427dac21bbe629a046747b018871cf2815b3477d1f3d54d269 +size 3042718 diff --git a/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff b/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff new file mode 100644 index 0000000000000000000000000000000000000000..2892be0add6c6c5e79e96e6df95caf2d4133a007 --- /dev/null +++ b/context_encoding_model/_tp0_bk3/model.MODULE_be035899334776123ed5+d208bdce.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdea9302d0f9d0785d148992ac29a3b377a867a1a9ce89c40e3ccad020e4ef73 +size 1506304 diff --git a/context_encoding_model/_tp0_bk3/neuron_config.json b/context_encoding_model/_tp0_bk3/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..43eeb83cb0939cab35789d5a18befed95b100a3c --- /dev/null +++ b/context_encoding_model/_tp0_bk3/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 1024 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk4/command.txt b/context_encoding_model/_tp0_bk4/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dbab82a3b124381dd3b27614e671341ce069d17 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --output model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json b/context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ddc4acaca97a666be38e6d989700f4adcf649a --- /dev/null +++ b/context_encoding_model/_tp0_bk4/compile_flags.MODULE_95ef7ca73cc0a6161be2+96be3c33.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/global_metric_store.json b/context_encoding_model/_tp0_bk4/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..e9591ca701bc6c5bd299d57e029f5f851cc847c3 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/global_metric_store.json @@ -0,0 +1,1177 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.93502807617188, + "StaticProfiler::AveragePartitionUtilization": 95.0970230102539, + "StaticProfiler::AveragePeUtilization": 97.18069458007813, + "StaticProfiler::LocalizationEfficiency": 73.73954010009766, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.04760026931762695, + "AffinePredicateResolution": 0.003319978713989258, + "AliasDependencyElimination": 0.0002167224884033203, + "AliasDependencyInduction": 0.008548259735107422, + "AliasDependencyReset": 0.03149843215942383, + "BFComputeCutting": 0.00810694694519043, + "BirCodeGenLoop": 0.2911098003387451, + "CCOpFusion": 0.08548593521118164, + "CanonicalizeConv": 2.7000001864507794e-05, + "CanonicalizeDAGForPGTiling": 0.007600545883178711, + "CanonicalizeForTensorizer": 5.699999746866524e-05, + "CanonicalizeIR": 0.0030400753021240234, + "Canonicalizer": 0.0011950000189244747, + "CoalesceCCOp": 0.020453453063964844, + "CommuteConcat": 0.007961034774780273, + "DMALocalityOpt": 0.016626834869384766, + "DMAProfiler": 0.018386363983154297, + "DMATilingProfiler": 0.009016752243041992, + "DataLocalityOpt": 0.17029356956481934, + "DataStreaming": 0.03981828689575195, + "DeConcat": 0.01120138168334961, + "DeadCodeElimination": 0.010882377624511719, + "DeadStoreElimination": 0.010195016860961914, + "DelinearIndices": 0.010077953338623047, + "Delinearization": 0.011870861053466797, + "DelinearizeSPMD": 0.035944223403930664, + "DoNothing": 0.0005605220794677734, + "DramToDramTranspose": 0.013046979904174805, + "DumpGraphAndMetadata": 0.03416705131530762, + "EliminateDivs": 0.004259586334228516, + "ExpandBatchNorm": 0.0017371177673339844, + "ExpandISAMacro": 0.014496326446533203, + "FactorizeBlkDims": 0.07086968421936035, + "FactorizeThreadAxesInFreeDims": 0.00911855697631836, + "FlattenMacroLoop": 0.0048520565032958984, + "GenericAccessSimplifier": 0.001367330551147461, + "HoistCompute": 6.000000212225132e-06, + "IdentifyCrossPassTensors": 6.199999916134402e-05, + "InferInitValue": 0.0836641788482666, + "InferIntrinsicOnCC": 0.008740901947021484, + "InferNeuronTensor": 0.05709338188171387, + "InferNonlocalTensors": 0.041548728942871094, + "InferPSumTensor": 0.23330998420715332, + "InferShardAxis": 0.5781030654907227, + "InferSharedMemLoc": 0.03158235549926758, + "InlineNativeKernels": 0.002477407455444336, + "InsertCoreBarrier": 0.015990734100341797, + "InsertIOTransposes": 0.039937734603881836, + "InsertImplicitShardAxisBeforeISel": 0.013466596603393555, + "InsertLocalTransposes": 0.018125534057617188, + "InsertOffloadedTransposes": 0.014874696731567383, + "LICM": 0.0058231353759765625, + "LateLegalizeInst": 0.037004947662353516, + "LateLegalizePostSplit": 0.02429652214050293, + "LateLowerReshapeOp": 0.0018832683563232422, + "LateLowerTensorOp": 0.0021920204162597656, + "LateNeuronInstComb": 0.06391644477844238, + "LayoutPreprocessing": 0.06973385810852051, + "LayoutPreprocessingAndAnalysis": 0.11140203475952148, + "LayoutRequirementAnalysis": 0.013022661209106445, + "LegalizeCCOpLayout": 0.0020427703857421875, + "LegalizeOpLevelAlias": 0.0016918182373046875, + "LegalizePartitionReduce": 0.0030241012573242188, + "LegalizeSundaAccess": 0.08372640609741211, + "LegalizeSundaMacro": 0.02708148956298828, + "LegalizeType": 0.04078388214111328, + "LocalLayoutOpt": 0.022045135498046875, + "LoopFusion": 0.029404163360595703, + "LoopSplitting": 0.0007355213165283203, + "LowerBroadcast": 0.02869558334350586, + "LowerCCOpBlockAxis": 0.007714748382568359, + "LowerComplexBroadcast": 0.005654096603393555, + "LowerIntrinsics": 0.051032304763793945, + "LowerShardAxis": 0.03305673599243164, + "LowerTensorOp": 0.028458356857299805, + "LowerToSendRecv": 0.03391242027282715, + "LowerTranspose": 0.051642656326293945, + "MacroGeneration": 0.06428074836730957, + "MaskPropagation": 0.0036263465881347656, + "MemcastMotion": 1.700000029813964e-05, + "MemcpyElimination": 0.05451250076293945, + "MutateDataType": 0.001516103744506836, + "NeuronAliasDependencyInduction": 0.0005834102630615234, + "NeuronAliasDependencyReset": 0.022034168243408203, + "NeuronInstComb": 0.06097984313964844, + "NeuronLICM": 0.05481839179992676, + "NeuronLoopFusion": 0.07339620590209961, + "NeuronLoopInterchange": 0.0027348995208740234, + "NeuronSimplifier": 0.021918296813964844, + "NeuronSimplifyPredicates": 0.024098873138427734, + "NeuronValueNumbering": 0.022985458374023438, + "OptimizeAliasedCopyChain": 0.0008976459503173828, + "OptimizeNKIKernels": 4.611967086791992, + "PAGLayoutOpt": 0.2917053699493408, + "PComputeCutting": 0.008776664733886719, + "PGLayoutTilingPipeline": 1.8517823219299316, + "PGTiling": 0.26313185691833496, + "PadElimination": 0.0006458759307861328, + "ParAxesAnnotation": 0.188338041305542, + "PartialLoopFusion": 0.05682229995727539, + "PartialSimdFusion": 0.0237729549407959, + "PenguinizeFunctions": 5.5999997130129486e-05, + "PerfectLoopNest": 0.00557398796081543, + "PruneFunctions": 3.9999998989515007e-05, + "RecognizeOpIdiom": 0.008669376373291016, + "Recompute": 0.0005908012390136719, + "RelaxPredicates": 0.006473541259765625, + "Rematerialization": 0.011237144470214844, + "RemoveOptimizationBarriers": 7.400000322377309e-05, + "RemoveShardedPartitionAxes": 0.014671802520751953, + "ReshapeWeights": 0.0018546581268310547, + "ResolveAccessConflict": 0.008959770202636719, + "ResolveComplicatePredicates": 0.0009264945983886719, + "RewriteReplicationMatmul": 0.0037200450897216797, + "RewriteWeights": 0.008005380630493164, + "SFKVectorizer": 0.2967853546142578, + "ScatterMotion": 1.900000097521115e-05, + "ShardingPropagationAnalysis": 0.10689902305603027, + "SimpleAllReduceTiling": 0.010908842086791992, + "Simplifier": 0.00808858871459961, + "SimplifyMacroPredicates": 0.031823158264160156, + "SimplifyNeuronTensor": 0.12780547142028809, + "SimplifySlice": 0.001531362533569336, + "SimplifyTensor": 0.018309593200683594, + "SpillPSum": 0.09417366981506348, + "SplitAPUnionSets": 0.09693408012390137, + "SplitAccGrp": 0.0025701522827148438, + "StaticProfiler": 0.04053521156311035, + "StaticTransposeLocalTensor": 0.012635231018066406, + "SundaISel": 0.10333561897277832, + "TCTransform": 0.006776332855224609, + "TensorInitialization": 0.011014938354492188, + "TensorOpSimplifier": 0.005452632904052734, + "TensorOpTransform": 0.033481597900390625, + "TensorizerLegalizationPass": 6.399999983841553e-05, + "TileCCOps": 0.011636972427368164, + "TilingProfiler": 0.024947643280029297, + "TransformConvOp": 0.013001441955566406, + "TritiumFusion": 0.1458723545074463, + "ValueNumbering": 0.003311634063720703, + "VectorizeDMA": 0.005986928939819336, + "VectorizeMatMult": 0.028806686401367188, + "VerifySupportedOps": 5.100000271340832e-05, + "WeightCoalescing": 0.01451730728149414, + "ZeroSizeTensorElimination": 0.00017833709716796875, + "algsimp": 0.0020910000894218683, + "batchnorm_expander": 5.0000002374872565e-05, + "boundary-marker-removal": 1.900000097521115e-05, + "call-inliner": 0.00046300000394694507, + "canonicalize-boundary-marker": 2.300000051036477e-05, + "collective-stream-id-checker": 8.800000068731606e-05, + "comparison-expander": 0.0005719999899156392, + "computation-deduplicator": 8.399999933317304e-05, + "config-lowering": 0.00016599999798927456, + "constant-statistics": 0.0004529999860096723, + "constant_folding": 0.00018699999782256782, + "cse": 6.299999949987978e-05, + "dce": 4.400000034365803e-05, + "dot_decomposer": 0.001028000027872622, + "dynamic-slice-transpose": 2.0000001313746907e-05, + "eliminate-redundant-compare": 0.00014699999883305281, + "emit-offloaded-dropout": 5.499999679159373e-05, + "flatten-call-graph": 0.0006470000371336937, + "fuse-send-recv": 9.600000339560211e-05, + "hilo-conditional-to-select": 2.9000000722589903e-05, + "hilo::LegalizeAlias": 1.500000053056283e-05, + "hilo::NeuronInstCombine": 0.00012700000661425292, + "hilo::NeuronOpFusion": 4.099999932805076e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 7.400000322377309e-05, + "hilo::ScheduleFusion": 3.000000106112566e-06, + "hilo::SixtyFourHack": 9.599999611964449e-05, + "hilo::VerifyAliasing": 6.000000212225132e-06, + "hlo-mac-count": 0.015143999829888344, + "instruction-histogram": 0.0010160000529140234, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 0.0, + "io-layout-normalization": 0.0007440000190399587, + "io-statistics": 3.9999998989515007e-05, + "legalize-ccops-for-tensorizer": 6.000000212225132e-06, + "legalize-compare": 1.3999999282532372e-05, + "lower-argminmax-custom-call": 1.5999999959603883e-05, + "map-inline": 0.0008340000058524311, + "metadata-naming": 7.79999973019585e-05, + "mlir::detail::OpToOpPassAdaptor": 5.8000001445179805e-05, + "mlir::hlo::MhloToPyPenguin": 0.013376999646425247, + "mlir::mhlo::LowerComplexExtraPass": 0.00022300001000985503, + "mlir::mhlo::LowerComplexPass": 0.0004149999876972288, + "native-to-custom-softmax": 0.0003029999788850546, + "native-to-custom-softmax-dx": 0.0021089999936521053, + "neuron-hlo-verifier": 0.011952999979257584, + "operand_upcaster": 5.5999997130129486e-05, + "opt-barrier-removal": 0.00026000000070780516, + "post-par-pipe-begin": 0.0003480000013951212, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.002303000073879957, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.07090699672698975, + "replace-minimum-constant": 0.0003819999983534217, + "reshape-mover": 6.299999949987978e-05, + "simplify-concat": 0.00014800000644754618, + "simplify-while-loops": 9.100000170292333e-05, + "transform-variadic-reduce": 9.299999510403723e-05, + "tuple-simplifier": 0.0001649999903747812, + "unpack-nested-aws-ntwsr": 0.00024099998699966818, + "unroll-while-loop": 3.5000000934815034e-05, + "zero_sized_hlo_elimination": 0.00072900002123788 + }, + "hilo": { + "ConstantSize": 3678847.0, + "HloInputCount": 371.0, + "HloMacCount": 111825780736.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910928384.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 973052032.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0, + "StaticProfiler::AifUb": 337.1839904785156, + "StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594, + "StaticProfiler::AverageDmaLength": 2413.602294921875, + "StaticProfiler::DDRTransferBytes": 495991840.0, + "StaticProfiler::InternalTransferBytes": 361682720.0, + "StaticProfiler::LoadExpanded": 133728.0, + "StaticProfiler::StoreExpanded": 7530.0, + "StaticProfiler::TotalDMAExpanded": 141258.0, + "StaticProfiler::TotalDynamicInstancesCount": 30781.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 14112.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 10273.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 10.0, + "TilingProfiler::SimdInstructionsAfterTiling": 311.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0017770000267773867, + "call-inliner": 0.00041700000292621553, + "collective-stream-id-checker": 5.500000042957254e-05, + "comparison-expander": 0.0005280000041238964, + "constant-statistics": 0.0004529999860096723, + "constant_folding": 0.0001429999974789098, + "dce": 3.899999865097925e-05, + "dot_decomposer": 0.001028000027872622, + "eliminate-redundant-compare": 0.0001320000010309741, + "flatten-call-graph": 0.0006070000235922635, + "hlo-mac-count": 0.007338999770581722, + "instruction-histogram": 0.0010160000529140234, + "io-con-pipe-begin": 4.999999873689376e-06, + "io-con-pipe-end": 0.0, + "io-layout-normalization": 0.0007440000190399587, + "io-statistics": 3.9999998989515007e-05, + "map-inline": 0.0007900000200606883, + "native-to-custom-softmax": 0.00028199999360367656, + "native-to-custom-softmax-dx": 0.00042699999175965786, + "neuron-hlo-verifier": 0.010262000374495983, + "opt-barrier-removal": 0.00026000000070780516, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.07090699672698975, + "replace-minimum-constant": 0.0003480000013951212, + "reshape-mover": 4.8999998398358e-05, + "simplify-while-loops": 8.099999831756577e-05, + "tuple-simplifier": 0.0001429999974789098, + "unpack-nested-aws-ntwsr": 0.00022499999613501132, + "unroll-while-loop": 1.2000000424450263e-05, + "zero_sized_hlo_elimination": 0.00072900002123788 + } + }, + "attention_isa_kernel": { + "compiletime": { + "CoalesceCCOp": 0.00029277801513671875, + "DMALocalityOpt": 0.00019669532775878906, + "DMAProfiler": 0.0002949237823486328, + "DataStreaming": 0.0002338886260986328, + "DoNothing": 0.0014209747314453125, + "ExpandISAMacro": 0.00028014183044433594, + "FactorizeBlkDims": 0.0051081180572509766, + "InferPSumTensor": 0.0036172866821289063, + "InferSharedMemLoc": 0.0005719661712646484, + "InsertCoreBarrier": 0.0023279190063476563, + "LateLegalizeInst": 0.0016858577728271484, + "LateNeuronInstComb": 0.00044226646423339844, + "LegalizeSundaAccess": 0.0002193450927734375, + "LegalizeType": 0.002800464630126953, + "LowerBroadcast": 0.0002620220184326172, + "LowerIntrinsics": 0.0003139972686767578, + "LowerTranspose": 0.0002512931823730469, + "NeuronInstComb": 0.0005278587341308594, + "NeuronLICM": 0.0002562999725341797, + "NeuronSimplifyPredicates": 0.0002334117889404297, + "NeuronValueNumbering": 0.0002815723419189453, + "SFKVectorizer": 0.005394458770751953, + "SimpleAllReduceTiling": 0.0003223419189453125, + "SimplifyNeuronTensor": 0.0007545948028564453, + "SpillPSum": 0.0006477832794189453, + "WeightCoalescing": 0.00023102760314941406 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00034165382385253906, + "DMALocalityOpt": 0.0003287792205810547, + "DMAProfiler": 0.001161336898803711, + "DataStreaming": 0.0004813671112060547, + "DoNothing": 0.00018596649169921875, + "ExpandISAMacro": 0.0008256435394287109, + "FactorizeBlkDims": 0.0007493495941162109, + "InferPSumTensor": 0.0011432170867919922, + "InferSharedMemLoc": 0.00045013427734375, + "InsertCoreBarrier": 0.00044918060302734375, + "LateLegalizeInst": 0.0019235610961914063, + "LateNeuronInstComb": 0.0011394023895263672, + "LegalizeSundaAccess": 0.002297639846801758, + "LegalizeType": 0.00036334991455078125, + "LowerBroadcast": 0.0003592967987060547, + "LowerIntrinsics": 0.000362396240234375, + "LowerTranspose": 0.0003514289855957031, + "NeuronInstComb": 0.0034132003784179688, + "NeuronLICM": 0.0006377696990966797, + "NeuronSimplifyPredicates": 0.0035140514373779297, + "NeuronValueNumbering": 0.001703023910522461, + "SFKVectorizer": 0.009377241134643555, + "SimpleAllReduceTiling": 0.0003190040588378906, + "SimplifyNeuronTensor": 0.0036399364471435547, + "SpillPSum": 0.0008790493011474609, + "WeightCoalescing": 0.0003619194030761719 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 1.9999999949504854e-06, + "CanonicalizeForTensorizer": 1.8999999156221747e-05, + "Canonicalizer": 0.0004579999949783087, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 2.5999999706982635e-05, + "MemcastMotion": 9.999999747378752e-06, + "PenguinizeFunctions": 1.9999999494757503e-05, + "PruneFunctions": 1.2000000424450263e-05, + "RemoveOptimizationBarriers": 2.499999936844688e-05, + "ScatterMotion": 9.999999747378752e-06, + "TensorizerLegalizationPass": 3.199999991920777e-05, + "VerifySupportedOps": 1.700000029813964e-05, + "algsimp": 8.900000102585182e-05, + "batchnorm_expander": 1.700000029813964e-05, + "boundary-marker-removal": 6.000000212225132e-06, + "call-inliner": 1.2999999853491317e-05, + "canonicalize-boundary-marker": 7.999999979801942e-06, + "collective-stream-id-checker": 2.499999936844688e-05, + "comparison-expander": 6.000000212225132e-06, + "computation-deduplicator": 2.499999936844688e-05, + "config-lowering": 5.0999999075429514e-05, + "constant_folding": 1.2999999853491317e-05, + "cse": 1.9999999494757503e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 7.000000096013537e-06, + "eliminate-redundant-compare": 4.999999873689376e-06, + "emit-offloaded-dropout": 1.8999999156221747e-05, + "flatten-call-graph": 1.2000000424450263e-05, + "fuse-send-recv": 2.8000000384054147e-05, + "hilo-conditional-to-select": 7.999999979801942e-06, + "hilo::LegalizeAlias": 6.000000212225132e-06, + "hilo::NeuronInstCombine": 5.900000178371556e-05, + "hilo::NeuronOpFusion": 1.1000000085914508e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 3.5000000934815034e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.8000000636675395e-05, + "hilo::VerifyAliasing": 3.000000106112566e-06, + "hlo-mac-count": 0.00014400000509340316, + "legalize-ccops-for-tensorizer": 3.000000106112566e-06, + "legalize-compare": 4.999999873689376e-06, + "lower-argminmax-custom-call": 4.999999873689376e-06, + "map-inline": 1.4000000192027073e-05, + "metadata-naming": 2.4000000848900527e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.0029299999587237835, + "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05, + "mlir::mhlo::LowerComplexPass": 0.00014200000441633165, + "native-to-custom-softmax": 7.999999979801942e-06, + "native-to-custom-softmax-dx": 0.0016329999780282378, + "neuron-hlo-verifier": 0.000598000013269484, + "operand_upcaster": 1.9999999494757503e-05, + "post-par-pipe-begin": 0.00034500000765547156, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0007699999841861427, + "replace-minimum-constant": 9.999999747378752e-06, + "reshape-mover": 4.999999873689376e-06, + "simplify-concat": 4.70000013592653e-05, + "simplify-while-loops": 3.000000106112566e-06, + "transform-variadic-reduce": 1.1000000085914508e-05, + "tuple-simplifier": 7.000000096013537e-06, + "unpack-nested-aws-ntwsr": 4.999999873689376e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 79.95455932617188, + "ConstantSize": 3678847.0, + "HloInputCount": 371.0, + "HloMacCount": 17179869184.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910928384.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 429740832.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.06203174591064453, + "AffinePredicateResolution": 0.001997709274291992, + "AliasDependencyElimination": 0.00024080276489257813, + "AliasDependencyInduction": 0.0331728458404541, + "AliasDependencyReset": 0.10205578804016113, + "BFComputeCutting": 0.007540702819824219, + "BirCodeGenLoop": 0.15983891487121582, + "CCOpFusion": 0.06544995307922363, + "CanonicalizeDAGForPGTiling": 0.004024982452392578, + "CanonicalizeIR": 0.001623392105102539, + "CoalesceCCOp": 0.011837482452392578, + "CommuteConcat": 0.009541988372802734, + "DMALocalityOpt": 0.0019822120666503906, + "DMAProfiler": 0.007272958755493164, + "DMATilingProfiler": 0.007293224334716797, + "DataLocalityOpt": 0.2593100070953369, + "DataStreaming": 0.0239105224609375, + "DeConcat": 0.005833864212036133, + "DeadCodeElimination": 0.00394749641418457, + "DeadStoreElimination": 0.07077240943908691, + "DelinearIndices": 0.02637171745300293, + "Delinearization": 0.01995396614074707, + "DelinearizeSPMD": 0.03704118728637695, + "DoNothing": 9.799003601074219e-05, + "DramToDramTranspose": 0.03482198715209961, + "DumpGraphAndMetadata": 0.01542520523071289, + "EliminateDivs": 0.005273103713989258, + "ExpandBatchNorm": 0.0026073455810546875, + "ExpandISAMacro": 0.008665800094604492, + "FactorizeBlkDims": 0.061437368392944336, + "FactorizeThreadAxesInFreeDims": 0.002484560012817383, + "FlattenMacroLoop": 0.008157968521118164, + "GenericAccessSimplifier": 0.0014643669128417969, + "InferInitValue": 0.08534860610961914, + "InferIntrinsicOnCC": 0.01716780662536621, + "InferNeuronTensor": 0.09510421752929688, + "InferNonlocalTensors": 0.16463732719421387, + "InferPSumTensor": 0.09516620635986328, + "InferShardAxis": 0.5436458587646484, + "InferSharedMemLoc": 0.013478994369506836, + "InlineNativeKernels": 0.0027844905853271484, + "InsertCoreBarrier": 0.008362293243408203, + "InsertIOTransposes": 0.07836699485778809, + "InsertImplicitShardAxisBeforeISel": 0.008057355880737305, + "InsertLocalTransposes": 0.01099085807800293, + "InsertOffloadedTransposes": 0.03647184371948242, + "LICM": 0.005979299545288086, + "LateLegalizeInst": 0.012919187545776367, + "LateLegalizePostSplit": 0.007997751235961914, + "LateLowerReshapeOp": 0.011852502822875977, + "LateLowerTensorOp": 0.007149696350097656, + "LateNeuronInstComb": 0.053853750228881836, + "LayoutPreprocessing": 0.07254910469055176, + "LayoutPreprocessingAndAnalysis": 0.13735532760620117, + "LayoutRequirementAnalysis": 0.012064695358276367, + "LegalizeCCOpLayout": 0.003309011459350586, + "LegalizeOpLevelAlias": 0.004944324493408203, + "LegalizePartitionReduce": 0.002275705337524414, + "LegalizeSundaAccess": 0.13529706001281738, + "LegalizeSundaMacro": 0.017252445220947266, + "LegalizeType": 0.007556915283203125, + "LocalLayoutOpt": 0.04438447952270508, + "LoopFusion": 0.018953561782836914, + "LoopSplitting": 0.0016851425170898438, + "LowerBroadcast": 0.005589485168457031, + "LowerCCOpBlockAxis": 0.009353399276733398, + "LowerComplexBroadcast": 0.011426210403442383, + "LowerIntrinsics": 0.04210019111633301, + "LowerShardAxis": 0.014751195907592773, + "LowerTensorOp": 0.02877187728881836, + "LowerToSendRecv": 0.006161689758300781, + "LowerTranspose": 0.02186894416809082, + "MacroGeneration": 0.1734302043914795, + "MaskPropagation": 0.014665842056274414, + "MemcpyElimination": 0.3008904457092285, + "MutateDataType": 0.0027010440826416016, + "NeuronAliasDependencyInduction": 0.0006909370422363281, + "NeuronAliasDependencyReset": 0.022809267044067383, + "NeuronInstComb": 0.005879402160644531, + "NeuronLICM": 0.0464015007019043, + "NeuronLoopFusion": 0.05638718605041504, + "NeuronLoopInterchange": 0.00871729850769043, + "NeuronSimplifier": 0.02101302146911621, + "NeuronSimplifyPredicates": 0.004530191421508789, + "NeuronValueNumbering": 0.007061004638671875, + "OptimizeAliasedCopyChain": 0.001558065414428711, + "OptimizeNKIKernels": 0.3715829849243164, + "PAGLayoutOpt": 0.648719310760498, + "PComputeCutting": 0.02423238754272461, + "PGLayoutTilingPipeline": 2.515984058380127, + "PGTiling": 0.46158504486083984, + "PadElimination": 0.0023555755615234375, + "ParAxesAnnotation": 0.5548486709594727, + "PartialLoopFusion": 0.04628252983093262, + "PartialSimdFusion": 0.06029558181762695, + "PerfectLoopNest": 0.0032892227172851563, + "RecognizeOpIdiom": 0.01747274398803711, + "Recompute": 0.00046896934509277344, + "RelaxPredicates": 0.00874948501586914, + "Rematerialization": 0.023741722106933594, + "RemoveShardedPartitionAxes": 0.041913747787475586, + "ReshapeWeights": 0.0023987293243408203, + "ResolveAccessConflict": 0.013326883316040039, + "ResolveComplicatePredicates": 0.0010704994201660156, + "RewriteReplicationMatmul": 0.00213623046875, + "RewriteWeights": 0.006081342697143555, + "SFKVectorizer": 0.5432095527648926, + "ShardingPropagationAnalysis": 0.04027843475341797, + "SimpleAllReduceTiling": 0.005087375640869141, + "Simplifier": 0.008136272430419922, + "SimplifyMacroPredicates": 0.010492086410522461, + "SimplifyNeuronTensor": 0.033696889877319336, + "SimplifySlice": 0.0016849040985107422, + "SimplifyTensor": 0.013016223907470703, + "SpillPSum": 0.04322075843811035, + "SplitAPUnionSets": 0.04480147361755371, + "SplitAccGrp": 0.0033092498779296875, + "StaticProfiler": 0.02093505859375, + "StaticTransposeLocalTensor": 0.011444330215454102, + "SundaISel": 0.0645599365234375, + "TCTransform": 0.0017342567443847656, + "TensorInitialization": 0.014005661010742188, + "TensorOpSimplifier": 0.010408163070678711, + "TensorOpTransform": 0.062005043029785156, + "TileCCOps": 0.007296085357666016, + "TilingProfiler": 0.04326295852661133, + "TransformConvOp": 0.004875659942626953, + "TritiumFusion": 0.12003302574157715, + "ValueNumbering": 0.007851839065551758, + "VectorizeDMA": 0.008031368255615234, + "VectorizeMatMult": 0.030368566513061523, + "WeightCoalescing": 0.009224891662597656, + "ZeroSizeTensorElimination": 0.0001709461212158203 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 3453.0, + "StaticProfiler::AifUb": 66.1578598022461, + "StaticProfiler::ArithmeticIntensityTensorizer": 256.2751770019531, + "StaticProfiler::AverageDmaLength": 1973.780029296875, + "StaticProfiler::AverageFractalPeUtilization": 99.81855773925781, + "StaticProfiler::AveragePartitionUtilization": 99.43334197998047, + "StaticProfiler::AveragePeUtilization": 99.31205749511719, + "StaticProfiler::DDRTransferBytes": 122882568.0, + "StaticProfiler::InternalTransferBytes": 87572480.0, + "StaticProfiler::LoadExpanded": 18965.0, + "StaticProfiler::LocalizationEfficiency": 387.36920166015625, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 656.1036376953125, + "StaticProfiler::StoreExpanded": 17921.0, + "StaticProfiler::TotalDMAExpanded": 36886.0, + "StaticProfiler::TotalDynamicInstancesCount": 4675.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 4662.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 192.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 1552.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 896.0, + "TilingProfiler::PfTransposeInstructionsForIo": 256.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 256.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 384.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 468.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.10222506523132324, + "AffinePredicateResolution": 0.002437591552734375, + "AliasDependencyElimination": 0.00020074844360351563, + "AliasDependencyInduction": 0.030005455017089844, + "AliasDependencyReset": 0.08542060852050781, + "BFComputeCutting": 0.009021759033203125, + "BirCodeGenLoop": 0.0576014518737793, + "CCOpFusion": 0.07059645652770996, + "CanonicalizeDAGForPGTiling": 0.011131525039672852, + "CanonicalizeIR": 0.0030748844146728516, + "CoalesceCCOp": 0.016925573348999023, + "CommuteConcat": 0.004233837127685547, + "DMALocalityOpt": 0.0022597312927246094, + "DMAProfiler": 0.011726617813110352, + "DMATilingProfiler": 0.010080099105834961, + "DataLocalityOpt": 0.45432257652282715, + "DataStreaming": 0.007066249847412109, + "DeConcat": 0.010270833969116211, + "DeadCodeElimination": 0.003401517868041992, + "DeadStoreElimination": 0.08969426155090332, + "DelinearIndices": 0.020795345306396484, + "Delinearization": 0.006405353546142578, + "DelinearizeSPMD": 0.031574249267578125, + "DoNothing": 0.00010728836059570313, + "DramToDramTranspose": 0.021518468856811523, + "DumpGraphAndMetadata": 0.00677490234375, + "EliminateDivs": 0.0029458999633789063, + "ExpandBatchNorm": 0.003565549850463867, + "ExpandISAMacro": 0.006104230880737305, + "FactorizeBlkDims": 0.03833317756652832, + "FactorizeThreadAxesInFreeDims": 0.007614850997924805, + "FlattenMacroLoop": 0.01127004623413086, + "GenericAccessSimplifier": 0.0043070316314697266, + "InferInitValue": 0.06825661659240723, + "InferIntrinsicOnCC": 0.046250104904174805, + "InferNeuronTensor": 0.09652161598205566, + "InferNonlocalTensors": 0.08535599708557129, + "InferPSumTensor": 0.08618307113647461, + "InferShardAxis": 0.6054186820983887, + "InferSharedMemLoc": 0.007490873336791992, + "InlineNativeKernels": 0.0046694278717041016, + "InsertCoreBarrier": 0.00831913948059082, + "InsertIOTransposes": 0.07386589050292969, + "InsertImplicitShardAxisBeforeISel": 0.012522697448730469, + "InsertLocalTransposes": 0.018398761749267578, + "InsertOffloadedTransposes": 0.03478860855102539, + "LICM": 0.006189107894897461, + "LateLegalizeInst": 0.018419742584228516, + "LateLegalizePostSplit": 0.011380195617675781, + "LateLowerReshapeOp": 0.006206035614013672, + "LateLowerTensorOp": 0.006627559661865234, + "LateNeuronInstComb": 0.013695240020751953, + "LayoutPreprocessing": 0.08205723762512207, + "LayoutPreprocessingAndAnalysis": 0.3778700828552246, + "LayoutRequirementAnalysis": 0.027397871017456055, + "LegalizeCCOpLayout": 0.004743337631225586, + "LegalizeOpLevelAlias": 0.001989126205444336, + "LegalizePartitionReduce": 0.003030061721801758, + "LegalizeSundaAccess": 0.026180505752563477, + "LegalizeSundaMacro": 0.02354145050048828, + "LegalizeType": 0.012012004852294922, + "LocalLayoutOpt": 0.09747910499572754, + "LoopFusion": 0.011905670166015625, + "LoopSplitting": 0.005662441253662109, + "LowerBroadcast": 0.0031082630157470703, + "LowerCCOpBlockAxis": 0.015021800994873047, + "LowerComplexBroadcast": 0.004594564437866211, + "LowerIntrinsics": 0.061724185943603516, + "LowerShardAxis": 0.01390695571899414, + "LowerTensorOp": 0.032297372817993164, + "LowerToSendRecv": 0.005787849426269531, + "LowerTranspose": 0.014832496643066406, + "MacroGeneration": 0.17066407203674316, + "MaskPropagation": 0.004767894744873047, + "MemcpyElimination": 0.3223605155944824, + "MutateDataType": 0.0023605823516845703, + "NeuronAliasDependencyInduction": 0.0017361640930175781, + "NeuronAliasDependencyReset": 0.02784562110900879, + "NeuronInstComb": 0.008632659912109375, + "NeuronLICM": 0.01805901527404785, + "NeuronLoopFusion": 0.041216135025024414, + "NeuronLoopInterchange": 0.0041141510009765625, + "NeuronSimplifier": 0.025291919708251953, + "NeuronSimplifyPredicates": 0.007104635238647461, + "NeuronValueNumbering": 0.0058324337005615234, + "OptimizeAliasedCopyChain": 0.0016317367553710938, + "OptimizeNKIKernels": 0.4839596748352051, + "PAGLayoutOpt": 0.3772914409637451, + "PComputeCutting": 0.03927016258239746, + "PGLayoutTilingPipeline": 2.7096974849700928, + "PGTiling": 0.5330896377563477, + "PadElimination": 0.0010271072387695313, + "ParAxesAnnotation": 0.32303333282470703, + "PartialLoopFusion": 0.05098128318786621, + "PartialSimdFusion": 0.10409116744995117, + "PerfectLoopNest": 0.008025884628295898, + "RecognizeOpIdiom": 0.014155864715576172, + "Recompute": 0.0006039142608642578, + "RelaxPredicates": 0.007999897003173828, + "Rematerialization": 0.0150146484375, + "RemoveShardedPartitionAxes": 0.04702639579772949, + "ReshapeWeights": 0.0015103816986083984, + "ResolveAccessConflict": 0.0074825286865234375, + "ResolveComplicatePredicates": 0.002012014389038086, + "RewriteReplicationMatmul": 0.002730846405029297, + "RewriteWeights": 0.01182103157043457, + "SFKVectorizer": 0.4407639503479004, + "ShardingPropagationAnalysis": 0.029230833053588867, + "SimpleAllReduceTiling": 0.005069255828857422, + "Simplifier": 0.020698070526123047, + "SimplifyMacroPredicates": 0.021116018295288086, + "SimplifyNeuronTensor": 0.012060403823852539, + "SimplifySlice": 0.0015597343444824219, + "SimplifyTensor": 0.014514684677124023, + "SpillPSum": 0.048569679260253906, + "SplitAPUnionSets": 0.05286097526550293, + "SplitAccGrp": 0.002934694290161133, + "StaticProfiler": 0.013947248458862305, + "StaticTransposeLocalTensor": 0.00755763053894043, + "SundaISel": 0.06808805465698242, + "TCTransform": 0.0025751590728759766, + "TensorInitialization": 0.005185127258300781, + "TensorOpSimplifier": 0.024057626724243164, + "TensorOpTransform": 0.06213688850402832, + "TileCCOps": 0.025543689727783203, + "TilingProfiler": 0.02153778076171875, + "TransformConvOp": 0.007241010665893555, + "TritiumFusion": 0.1687297821044922, + "ValueNumbering": 0.009909868240356445, + "VectorizeDMA": 0.008072137832641602, + "VectorizeMatMult": 0.042955636978149414, + "WeightCoalescing": 0.003875732421875, + "ZeroSizeTensorElimination": 0.00020575523376464844 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 8283.0, + "StaticProfiler::AifUb": 502.6534729003906, + "StaticProfiler::ArithmeticIntensityTensorizer": 413.67962646484375, + "StaticProfiler::AverageDmaLength": 2481.933349609375, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.62867736816406, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 266536960.0, + "StaticProfiler::InternalTransferBytes": 79167488.0, + "StaticProfiler::LoadExpanded": 71809.0, + "StaticProfiler::LocalizationEfficiency": 82.29916381835938, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 103.46524047851563, + "StaticProfiler::StoreExpanded": 18433.0, + "StaticProfiler::TotalDMAExpanded": 90242.0, + "StaticProfiler::TotalDynamicInstancesCount": 9699.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 9699.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 128.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 6144.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 992.0, + "TilingProfiler::PfTransposeInstructionsForIo": 288.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 192.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 512.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 547.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.04760026931762695, + "AffinePredicateResolution": 0.003319978713989258, + "AliasDependencyElimination": 0.0002167224884033203, + "AliasDependencyInduction": 0.008548259735107422, + "AliasDependencyReset": 0.03149843215942383, + "BFComputeCutting": 0.00810694694519043, + "BirCodeGenLoop": 0.2911098003387451, + "CCOpFusion": 0.08548593521118164, + "CanonicalizeDAGForPGTiling": 0.007600545883178711, + "CanonicalizeIR": 0.0030400753021240234, + "CoalesceCCOp": 0.008062601089477539, + "CommuteConcat": 0.007961034774780273, + "DMALocalityOpt": 0.002327442169189453, + "DMAProfiler": 0.009556293487548828, + "DMATilingProfiler": 0.009016752243041992, + "DataLocalityOpt": 0.17029356956481934, + "DataStreaming": 0.007345914840698242, + "DeConcat": 0.01120138168334961, + "DeadCodeElimination": 0.010882377624511719, + "DeadStoreElimination": 0.010195016860961914, + "DelinearIndices": 0.010077953338623047, + "Delinearization": 0.011870861053466797, + "DelinearizeSPMD": 0.035944223403930664, + "DoNothing": 0.0001087188720703125, + "DramToDramTranspose": 0.013046979904174805, + "DumpGraphAndMetadata": 0.03416705131530762, + "EliminateDivs": 0.004259586334228516, + "ExpandBatchNorm": 0.0017371177673339844, + "ExpandISAMacro": 0.0058269500732421875, + "FactorizeBlkDims": 0.03687334060668945, + "FactorizeThreadAxesInFreeDims": 0.00911855697631836, + "FlattenMacroLoop": 0.0048520565032958984, + "GenericAccessSimplifier": 0.001367330551147461, + "InferInitValue": 0.0836641788482666, + "InferIntrinsicOnCC": 0.008740901947021484, + "InferNeuronTensor": 0.05709338188171387, + "InferNonlocalTensors": 0.041548728942871094, + "InferPSumTensor": 0.05230545997619629, + "InferShardAxis": 0.5781030654907227, + "InferSharedMemLoc": 0.026081323623657227, + "InlineNativeKernels": 0.002477407455444336, + "InsertCoreBarrier": 0.008142948150634766, + "InsertIOTransposes": 0.039937734603881836, + "InsertImplicitShardAxisBeforeISel": 0.013466596603393555, + "InsertLocalTransposes": 0.018125534057617188, + "InsertOffloadedTransposes": 0.014874696731567383, + "LICM": 0.0058231353759765625, + "LateLegalizeInst": 0.01174783706665039, + "LateLegalizePostSplit": 0.02429652214050293, + "LateLowerReshapeOp": 0.0018832683563232422, + "LateLowerTensorOp": 0.0021920204162597656, + "LateNeuronInstComb": 0.043119192123413086, + "LayoutPreprocessing": 0.06973385810852051, + "LayoutPreprocessingAndAnalysis": 0.11140203475952148, + "LayoutRequirementAnalysis": 0.013022661209106445, + "LegalizeCCOpLayout": 0.0020427703857421875, + "LegalizeOpLevelAlias": 0.0016918182373046875, + "LegalizePartitionReduce": 0.0030241012573242188, + "LegalizeSundaAccess": 0.045601606369018555, + "LegalizeSundaMacro": 0.02708148956298828, + "LegalizeType": 0.014174222946166992, + "LocalLayoutOpt": 0.022045135498046875, + "LoopFusion": 0.029404163360595703, + "LoopSplitting": 0.0007355213165283203, + "LowerBroadcast": 0.005047321319580078, + "LowerCCOpBlockAxis": 0.007714748382568359, + "LowerComplexBroadcast": 0.005654096603393555, + "LowerIntrinsics": 0.04253792762756348, + "LowerShardAxis": 0.03305673599243164, + "LowerTensorOp": 0.028458356857299805, + "LowerToSendRecv": 0.03391242027282715, + "LowerTranspose": 0.04655814170837402, + "MacroGeneration": 0.06428074836730957, + "MaskPropagation": 0.0036263465881347656, + "MemcpyElimination": 0.05451250076293945, + "MutateDataType": 0.001516103744506836, + "NeuronAliasDependencyInduction": 0.0005834102630615234, + "NeuronAliasDependencyReset": 0.022034168243408203, + "NeuronInstComb": 0.04628133773803711, + "NeuronLICM": 0.026567935943603516, + "NeuronLoopFusion": 0.07339620590209961, + "NeuronLoopInterchange": 0.0027348995208740234, + "NeuronSimplifier": 0.021918296813964844, + "NeuronSimplifyPredicates": 0.014072179794311523, + "NeuronValueNumbering": 0.013863325119018555, + "OptimizeAliasedCopyChain": 0.0008976459503173828, + "OptimizeNKIKernels": 4.611967086791992, + "PAGLayoutOpt": 0.2917053699493408, + "PComputeCutting": 0.008776664733886719, + "PGLayoutTilingPipeline": 1.8517823219299316, + "PGTiling": 0.26313185691833496, + "PadElimination": 0.0006458759307861328, + "ParAxesAnnotation": 0.188338041305542, + "PartialLoopFusion": 0.05682229995727539, + "PartialSimdFusion": 0.0237729549407959, + "PerfectLoopNest": 0.00557398796081543, + "RecognizeOpIdiom": 0.008669376373291016, + "Recompute": 0.0005908012390136719, + "RelaxPredicates": 0.006473541259765625, + "Rematerialization": 0.011237144470214844, + "RemoveShardedPartitionAxes": 0.014671802520751953, + "ReshapeWeights": 0.0018546581268310547, + "ResolveAccessConflict": 0.008959770202636719, + "ResolveComplicatePredicates": 0.0009264945983886719, + "RewriteReplicationMatmul": 0.0037200450897216797, + "RewriteWeights": 0.008005380630493164, + "SFKVectorizer": 0.1923050880432129, + "ShardingPropagationAnalysis": 0.10689902305603027, + "SimpleAllReduceTiling": 0.003542184829711914, + "Simplifier": 0.00808858871459961, + "SimplifyMacroPredicates": 0.031823158264160156, + "SimplifyNeuronTensor": 0.013367414474487305, + "SimplifySlice": 0.001531362533569336, + "SimplifyTensor": 0.018309593200683594, + "SpillPSum": 0.03448653221130371, + "SplitAPUnionSets": 0.09693408012390137, + "SplitAccGrp": 0.0025701522827148438, + "StaticProfiler": 0.04053521156311035, + "StaticTransposeLocalTensor": 0.012635231018066406, + "SundaISel": 0.10333561897277832, + "TCTransform": 0.006776332855224609, + "TensorInitialization": 0.011014938354492188, + "TensorOpSimplifier": 0.005452632904052734, + "TensorOpTransform": 0.033481597900390625, + "TileCCOps": 0.011636972427368164, + "TilingProfiler": 0.024947643280029297, + "TransformConvOp": 0.013001441955566406, + "TritiumFusion": 0.1458723545074463, + "ValueNumbering": 0.003311634063720703, + "VectorizeDMA": 0.005986928939819336, + "VectorizeMatMult": 0.028806686401367188, + "WeightCoalescing": 0.007086515426635742, + "ZeroSizeTensorElimination": 0.00017833709716796875 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 25519.0, + "StaticProfiler::AifUb": 337.1839904785156, + "StaticProfiler::ArithmeticIntensityTensorizer": 248.63792419433594, + "StaticProfiler::AverageDmaLength": 2413.602294921875, + "StaticProfiler::AverageFractalPeUtilization": 98.93502807617188, + "StaticProfiler::AveragePartitionUtilization": 95.0970230102539, + "StaticProfiler::AveragePeUtilization": 97.18069458007813, + "StaticProfiler::DDRTransferBytes": 495991840.0, + "StaticProfiler::InternalTransferBytes": 361682720.0, + "StaticProfiler::LoadExpanded": 133728.0, + "StaticProfiler::LocalizationEfficiency": 73.73954010009766, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 79.92718505859375, + "StaticProfiler::StoreExpanded": 7530.0, + "StaticProfiler::TotalDMAExpanded": 141258.0, + "StaticProfiler::TotalDynamicInstancesCount": 30781.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 30330.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 14112.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 10273.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 768.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 10.0, + "TilingProfiler::SimdInstructionsAfterTiling": 311.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 1.8000000636675395e-05, + "CanonicalizeForTensorizer": 1.8000000636675395e-05, + "Canonicalizer": 0.0003330000035930425, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.5999999959603883e-05, + "MemcastMotion": 7.000000096013537e-06, + "PenguinizeFunctions": 1.8000000636675395e-05, + "PruneFunctions": 1.8000000636675395e-05, + "RemoveOptimizationBarriers": 2.4000000848900527e-05, + "ScatterMotion": 7.000000096013537e-06, + "TensorizerLegalizationPass": 2.300000051036477e-05, + "VerifySupportedOps": 1.5999999959603883e-05, + "algsimp": 9.899999713525176e-05, + "batchnorm_expander": 1.5999999959603883e-05, + "boundary-marker-removal": 7.000000096013537e-06, + "call-inliner": 1.4000000192027073e-05, + "canonicalize-boundary-marker": 7.999999979801942e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 7.999999979801942e-06, + "computation-deduplicator": 2.700000004551839e-05, + "config-lowering": 4.999999873689376e-05, + "constant_folding": 1.4000000192027073e-05, + "cse": 1.8000000636675395e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 6.000000212225132e-06, + "eliminate-redundant-compare": 4.999999873689376e-06, + "emit-offloaded-dropout": 1.5999999959603883e-05, + "flatten-call-graph": 1.1000000085914508e-05, + "fuse-send-recv": 2.9000000722589903e-05, + "hilo-conditional-to-select": 9.000000318337698e-06, + "hilo::LegalizeAlias": 6.000000212225132e-06, + "hilo::NeuronInstCombine": 5.400000009103678e-05, + "hilo::NeuronOpFusion": 1.2000000424450263e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 1.8000000636675395e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 1.5999999959603883e-05, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 0.00012700000661425292, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 4.999999873689376e-06, + "lower-argminmax-custom-call": 4.999999873689376e-06, + "map-inline": 1.4000000192027073e-05, + "metadata-naming": 2.9000000722589903e-05, + "mlir::detail::OpToOpPassAdaptor": 2.499999936844688e-05, + "mlir::hlo::MhloToPyPenguin": 0.0017209999496117234, + "mlir::mhlo::LowerComplexExtraPass": 7.200000254670158e-05, + "mlir::mhlo::LowerComplexPass": 0.00014099999680183828, + "native-to-custom-softmax": 7.000000096013537e-06, + "native-to-custom-softmax-dx": 2.300000051036477e-05, + "neuron-hlo-verifier": 0.0005729999975301325, + "operand_upcaster": 1.8999999156221747e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0007699999841861427, + "replace-minimum-constant": 9.000000318337698e-06, + "reshape-mover": 3.999999989900971e-06, + "simplify-concat": 4.8999998398358e-05, + "simplify-while-loops": 3.000000106112566e-06, + "transform-variadic-reduce": 1.1000000085914508e-05, + "tuple-simplifier": 7.000000096013537e-06, + "unpack-nested-aws-ntwsr": 4.999999873689376e-06, + "unroll-while-loop": 2.099999983329326e-05 + }, + "hilo": { + "ArithmeticIntensity": 661.1749267578125, + "HloMacCount": 55834574848.0, + "Traffic": 168895008.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 7.000000096013537e-06, + "CanonicalizeForTensorizer": 1.9999999494757503e-05, + "Canonicalizer": 0.0004039999912492931, + "HoistCompute": 0.0, + "IdentifyCrossPassTensors": 1.9999999494757503e-05, + "MemcastMotion": 0.0, + "PenguinizeFunctions": 1.8000000636675395e-05, + "PruneFunctions": 9.999999747378752e-06, + "RemoveOptimizationBarriers": 2.499999936844688e-05, + "ScatterMotion": 1.9999999949504854e-06, + "TensorizerLegalizationPass": 9.000000318337698e-06, + "VerifySupportedOps": 1.8000000636675395e-05, + "algsimp": 0.00012599999899975955, + "batchnorm_expander": 1.700000029813964e-05, + "boundary-marker-removal": 6.000000212225132e-06, + "call-inliner": 1.8999999156221747e-05, + "canonicalize-boundary-marker": 7.000000096013537e-06, + "collective-stream-id-checker": 3.999999989900971e-06, + "comparison-expander": 2.9999999242136255e-05, + "computation-deduplicator": 3.199999991920777e-05, + "config-lowering": 6.500000017695129e-05, + "constant_folding": 1.700000029813964e-05, + "cse": 2.499999936844688e-05, + "dce": 1.9999999949504854e-06, + "dynamic-slice-transpose": 7.000000096013537e-06, + "eliminate-redundant-compare": 4.999999873689376e-06, + "emit-offloaded-dropout": 1.9999999494757503e-05, + "flatten-call-graph": 1.700000029813964e-05, + "fuse-send-recv": 3.899999865097925e-05, + "hilo-conditional-to-select": 1.2000000424450263e-05, + "hilo::LegalizeAlias": 3.000000106112566e-06, + "hilo::NeuronInstCombine": 1.4000000192027073e-05, + "hilo::NeuronOpFusion": 1.8000000636675395e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 2.099999983329326e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 6.199999916134402e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.007534000091254711, + "legalize-ccops-for-tensorizer": 1.9999999949504854e-06, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 6.000000212225132e-06, + "map-inline": 1.5999999959603883e-05, + "metadata-naming": 2.499999936844688e-05, + "mlir::detail::OpToOpPassAdaptor": 1.2999999853491317e-05, + "mlir::hlo::MhloToPyPenguin": 0.008725999854505062, + "mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05, + "mlir::mhlo::LowerComplexPass": 0.0001320000010309741, + "native-to-custom-softmax": 6.000000212225132e-06, + "native-to-custom-softmax-dx": 2.5999999706982635e-05, + "neuron-hlo-verifier": 0.0005200000014156103, + "operand_upcaster": 1.700000029813964e-05, + "post-par-pipe-begin": 1.9999999949504854e-06, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00076299998909235, + "replace-minimum-constant": 1.4999999621068127e-05, + "reshape-mover": 4.999999873689376e-06, + "simplify-concat": 5.199999941396527e-05, + "simplify-while-loops": 3.999999989900971e-06, + "transform-variadic-reduce": 7.100000220816582e-05, + "tuple-simplifier": 7.999999979801942e-06, + "unpack-nested-aws-ntwsr": 6.000000212225132e-06, + "unroll-while-loop": 9.999999974752427e-07 + }, + "hilo": { + "ArithmeticIntensity": 207.31654357910156, + "HloMacCount": 38811336704.0, + "Traffic": 374416192.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.012049198150634766, + "DMALocalityOpt": 0.013970613479614258, + "DMAProfiler": 0.007668733596801758, + "DataStreaming": 0.031991004943847656, + "DoNothing": 0.0002658367156982422, + "ExpandISAMacro": 0.007843732833862305, + "FactorizeBlkDims": 0.03324699401855469, + "InferPSumTensor": 0.17986130714416504, + "InferSharedMemLoc": 0.0050508975982666016, + "InsertCoreBarrier": 0.0073986053466796875, + "LateLegalizeInst": 0.02333354949951172, + "LateNeuronInstComb": 0.01965785026550293, + "LegalizeSundaAccess": 0.0358271598815918, + "LegalizeType": 0.026246309280395508, + "LowerBroadcast": 0.023288965225219727, + "LowerIntrinsics": 0.008131980895996094, + "LowerTranspose": 0.004733085632324219, + "NeuronInstComb": 0.01128530502319336, + "NeuronLICM": 0.027612686157226563, + "NeuronSimplifyPredicates": 0.006512641906738281, + "NeuronValueNumbering": 0.007419109344482422, + "SFKVectorizer": 0.09510302543640137, + "SimpleAllReduceTiling": 0.0070476531982421875, + "SimplifyNeuronTensor": 0.11079812049865723, + "SpillPSum": 0.058808088302612305, + "WeightCoalescing": 0.0070688724517822266 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk4/graph.neff b/context_encoding_model/_tp0_bk4/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..0c6ac4143757fd762b2f565262967b22ebee8d1a --- /dev/null +++ b/context_encoding_model/_tp0_bk4/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d +size 1926144 diff --git a/context_encoding_model/_tp0_bk4/log-neuron-cc.txt b/context_encoding_model/_tp0_bk4/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..c80f1d9aada0a31fe3a0722a6038cf7476c97358 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/log-neuron-cc.txt @@ -0,0 +1,9559 @@ +2025-11-04T21:38:33Z INFO 8685 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:33Z INFO 8685 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:33Z INFO 8698 [root]: XLA detected +2025-11-04T21:38:33Z INFO 8698 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:34Z INFO 8698 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4 +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8312 + reshape 1912 23.00% ################################################################ + broadcast 1123 13.51% ##################################### + transpose 1072 12.90% ################################### + convert 945 11.37% ############################### + constant 636 7.65% ##################### + parameter 371 4.46% ############ + slice 347 4.17% ########### + add 284 3.42% ######### + get-tuple-element 259 3.12% ######## + multiply 255 3.07% ######## + dot 198 2.38% ###### + call 174 2.09% ##### + compare 173 2.08% ##### + select 170 2.05% ##### + concatenate 116 1.40% ### + tuple 57 0.69% # + scatter 57 0.69% # + negate 56 0.67% # + all-reduce 56 0.67% # + divide 29 0.35% + gather 6 0.07% + iota 5 0.06% + all-gather 3 0.04% + reduce 3 0.04% + custom-call 2 0.02% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5437 + reshape 1421 26.14% ################################################################ + transpose 817 15.03% #################################### + convert 720 13.24% ################################ + constant 443 8.15% ################### + parameter 371 6.82% ################ + broadcast 266 4.89% ########### + dot 197 3.62% ######## + custom-call 175 3.22% ####### + multiply 171 3.15% ####### + add 171 3.15% ####### + get-tuple-element 147 2.70% ###### + slice 115 2.12% ##### + concatenate 114 2.10% ##### + compare 59 1.09% ## + select 58 1.07% ## + scatter 57 1.05% ## + negate 56 1.03% ## + all-reduce 56 1.03% ## + gather 6 0.11% + all-gather 3 0.06% + iota 3 0.06% + reduce 3 0.06% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 1507533520896 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: IR signature: 66a95a93f4019d420bf017fa5e43303ad25f3e0a31011f48fad97ade9028ee76 for sg0000/HLOToTensorizer +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: IR signature: bdabb093663dc2324f935e932f22345ab4111086fe33706a3c2e0f7ba61b67a0 for sg0001/HLOToTensorizer +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: IR signature: b450695497ff1fc37081039a148fbf215cafeb494d658b63147a29e8e8488685 for sg0002/HLOToTensorizer +2025-11-04T21:38:34Z INFO 8698 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:34Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:34Z INFO 8698 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:34Z INFO 8698 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:34Z INFO 8698 [job.Frontend.0]: Start model loading +2025-11-04T21:38:34Z INFO 8698 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:34Z INFO 8698 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:34Z USER 8698 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:34Z INFO 8698 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:34Z INFO 8739 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:34Z INFO 8738 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:34Z INFO 8740 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:34Z INFO 8738 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:34Z INFO 8738 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:34Z INFO 8739 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:34Z INFO 8739 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:34Z INFO 8739 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:34Z INFO 8738 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:34Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.005 seconds +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:34Z INFO 8738 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.032 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.011 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.065 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.016 seconds +2025-11-04T21:38:35Z INFO 8740 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.077 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.013 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.028 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.024 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.022 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.046 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.017 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.013 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:35Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.033 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.054 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.031 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.062 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.049 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.007 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.055 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:35Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:35Z INFO 8739 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.030 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.066 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.085 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.032 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.024 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.043 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.029 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.089 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.095 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.309 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.062 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.322 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.073 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.033 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.102 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.040 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.040 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.025 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.027 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.013 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.018 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.029 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.283 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.201 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.301 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.003 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.034 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:36Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.019 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.009 seconds +2025-11-04T21:38:36Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.017 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8740 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.015 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6459 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6596 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.019 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.099 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.024 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.070 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.111 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.090 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.008 seconds +2025-11-04T21:38:37Z INFO 8739 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.043 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.042 seconds +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.021 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.021 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:37Z INFO 8740 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.059 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.106 seconds +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.032 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.188 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.292 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.036 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.046 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.107 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.016 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.097 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.027 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.016 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.017 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 36 +total number of sharded dags: 13 + +total bytes transferred from input, output, non local tensors: 391205666 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 366017296 +% bytes transferred with 2x bandwidths: 93.56 + +NC0 FLOPs: 181850210 +NC1 FLOPs: 181842016 +% FLOPs sharded: 100.00 + + +Shard dim: 2048, Number of dags: 7 +Matmuls sharded with this dim: +[2048(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [2048(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[2048(s),2,8,128] @ [2,8,128,2,6,2,128] = [2048(s),2,6,2,128] Number of occurrences: 2 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.082 seconds +2025-11-04T21:38:38Z INFO 8739 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.071 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8738 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TileCCOps]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.578 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.029 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:38Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.027 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG85'), (25, 'AG82'), (19, 'AG84'), (24, 'AG83')] +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(26, 'AG77'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG94'), (23, 'AG93'), (21, 'AG79'), (22, 'AG78')] +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.378 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.048 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.085 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.064 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.263 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.044 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.040 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.015 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.013 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.852 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 96: simd128x512 +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 4: reduce512x1x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 4: simd1x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 4: reduce512x1x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.073 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.323 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.137 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.053 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.377 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.057 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.032 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.029 seconds +2025-11-04T21:38:39Z INFO 8739 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.165 seconds +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:39Z INFO 8738 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.170 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: simd128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 32: dma128x1024 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: reduce512x1x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: simd1x512 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 4: reduce512x1x1 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.027 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.056 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 32 +total number of sharded dags: 25 + +total bytes transferred from input, output, non local tensors: 119546884 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 85987328 +% bytes transferred with 2x bandwidths: 71.93 + +NC0 FLOPs: 36893488143169486851 +NC1 FLOPs: 36893488143169486848 +% FLOPs sharded: 100.00 + + +Shard dim: 2048, Number of dags: 24 +Matmuls sharded with this dim: +[2048(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [2048(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[2048(s),2,8,128] @ [2,8,128,2,2,2,2,64] = [2048(s),2,2,2,2,64] Number of occurrences: 1 +[2048(s),2,8,128] @ [2,8,128,2,6,2,128] = [2048(s),2,6,2,128] Number of occurrences: 2 +[2048(s),2,8,128] @ [2,8,128,4,128] = [2048(s),4,128] Number of occurrences: 1 +[2048(s),2,8,128] @ [2,8,128,4,2,64] = [2048(s),4,2,64] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[2048,4,2,128] @ [4,2,128,2(s),2,4,128] = [2048,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.047 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.605 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.555 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.032 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.649 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.020 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.037 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.084 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.040 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.018 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG119'), (30, 'AG116'), (25, 'AG118'), (28, 'AG117')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(31, 'AG111'), (27, 'AG113'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(13, 'AG123'), (9, 'AG124')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 4, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(18, 'AG128'), (14, 'AG129')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG113'), (31, 'AG111'), (29, 'AG112')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(20, 'AG135'), (12, 'AG137'), (17, 'AG136')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 694 of IO tensor non_local bfloat16 %reshape.68(4, 2, 2, 64, 2, 1024) is not sorted, index list (w/ AG ids): [(10, 'AG130'), (15, 'AG131'), (7, 'AG115'), (26, 'AG114')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 644 of IO tensor non_local bfloat16 %reshape.73(4, 2, 2, 1024, 128) is not sorted, index list (w/ AG ids): [(11, 'AG133'), (16, 'AG134'), (7, 'AG115'), (19, 'AG132')] +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.102 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.103 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.039 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8739 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 32 +total number of sharded dags: 25 + +total bytes transferred from input, output, non local tensors: 68180998 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 59791360 +% bytes transferred with 2x bandwidths: 87.70 + +NC0 FLOPs: 36893488143150284803 +NC1 FLOPs: 36893488143150284800 +% FLOPs sharded: 100.00 + + +Shard dim: 2048, Number of dags: 23 +Matmuls sharded with this dim: +[2048(s),2,2,4,128] @ [2,2,4,128,2,2,2,2,64] = [2048(s),2,2,2,2,64] Number of occurrences: 1 +[2048(s),2,2,4,128] @ [2,2,4,128,4,128] = [2048(s),4,128] Number of occurrences: 1 +[2048(s),2,2,4,128] @ [2,2,4,128,4,2,64] = [2048(s),4,2,64] Number of occurrences: 1 +[64] @ [2048(s)] = [64,2048(s)] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 1 +Matmuls sharded with this dim: +[2048,4,2,128] @ [4,2,128,2(s),2,4,128] = [2048,2(s),2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + + +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.073 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8740 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.026 seconds +2025-11-04T21:38:40Z INFO 8738 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.037 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.042 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.544 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.171 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.059 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.533 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.073 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.015 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.014 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.074 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.018 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.046 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.035 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 2, 4) is not sorted, index list (w/ AG ids): [(30, 'AG95'), (24, 'AG98'), (21, 'AG97'), (27, 'AG96')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 2, 4, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(24, 'AG98'), (30, 'AG95'), (21, 'AG97'), (27, 'AG96')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(25, 'AG101'), (22, 'AG104')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 2, 4, 4, 2, 64) is not sorted, index list (w/ AG ids): [(24, 'AG98'), (30, 'AG95'), (21, 'AG97'), (27, 'AG96')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(25, 'AG101'), (18, 'AG108')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 2, 4, 4, 128) is not sorted, index list (w/ AG ids): [(24, 'AG98'), (30, 'AG95'), (21, 'AG97'), (27, 'AG96')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(28, 'AG114'), (23, 'AG116'), (26, 'AG115')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 631 of IO tensor non_local bfloat16 %all_gather.1(2, 2, 4, 128, 2, 1024) is not sorted, index list (w/ AG ids): [(21, 'AG97'), (24, 'AG98'), (27, 'AG96'), (1, 'AG100'), (29, 'AG99')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 520 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate0(2, 1024, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(1, 'AG100'), (29, 'AG99'), (24, 'AG98'), (21, 'AG97'), (27, 'AG96')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 582 of IO tensor non_local bfloat16 %reshape.16(2, 2, 2, 2, 64, 2, 1024) is not sorted, index list (w/ AG ids): [(6, 'AG107'), (13, 'AG106'), (17, 'AG105'), (22, 'AG104'), (25, 'AG101'), (1, 'AG100')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 676 of IO tensor non_local bfloat16 %reshape.24(4, 2, 2, 64, 2, 1024) is not sorted, index list (w/ AG ids): [(7, 'AG109'), (14, 'AG110'), (18, 'AG108'), (25, 'AG101'), (1, 'AG100')] +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 614 of IO tensor non_local bfloat16 %reshape.29(4, 2, 2, 1024, 128) is not sorted, index list (w/ AG ids): [(8, 'AG112'), (15, 'AG113'), (1, 'AG100'), (19, 'AG111')] +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.062 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.022 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.710 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 96: simd128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.022 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.006 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.019 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.090 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.002 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.097 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.026 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.024 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.009 seconds +2025-11-04T21:38:41Z INFO 8739 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.173 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.462 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.146 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.047 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.078 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.036 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.056 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.057 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.035 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.516 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.030 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: indirect_load128x256 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingBottleneck]: 32: simd128x512 +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.043 seconds +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.047 seconds +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:41Z INFO 8740 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.028 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.043 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.088 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.454 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1536: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: simd128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.095 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.024 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.024 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.034 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.050 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.051 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.043 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.041 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.027 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.021 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.027 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.259 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: indirect_load128x256 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x512 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: transpose_128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: generic_store128x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 32: simd128x256 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.052 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.020 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.068 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.017 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.025 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.021 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.068 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.028 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.046 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.023 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.007 seconds +2025-11-04T21:38:42Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.041 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.014 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.038 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.099 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.114 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.047 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.085 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.192 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.021 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 56.654% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'992.1591'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1590, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_992 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 9.105% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 6, 2, 2, 128, 2048) %1532[i11_0,i11_1_0,2i10_0_0_1_0+i10_0_0_1_1,i10_0_0_0,c2_1041,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input366'[i10_0_0_0,2i10_0_0_1_0+i10_0_0_1_1,i0.128,c2_1041,i1.2048] # id=1367, src_id=None, , instances=96 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 9.105% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 6, 2, 2, 128, 2048) %1530[i16_0_1076,i13_1_0,2i12_0_0_1_0+i12_0_0_1_1,i12_0_0_0,c2_1052,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input368'[i12_0_0_0,2i12_0_0_1_0+i12_0_0_1_1,i0.128,c2_1052,i1.2048] # id=1370, src_id=None, , instances=96 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 198.539us (24.000MiB, est bw: 126.755GB/s, 7.385% of tot. time) for bfloat16<128 x 512> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 6, 128, 2, 512) %'input365_local_1070'[i16_0_1076,i15_0_0_0_1,i15_0_0_0_0,c1_1062_2054,c2_1063_2054,i0.128,i3.2,i1.128+128i2.2+256p_1701_2054] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input365'[i15_0_0_0_1+2i15_0_0_0_0,p_1701_2054,c1_1062_2054,i0.128,c2_1063_2054,i3.2,i2.2,i1.128] # id=1376, src_id=None, , instances=192 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 7.207% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.55'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'dot.200.1601'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1599, src_id=None, , instances=600 # dl = tensor_op_name: _dot.200 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 1.558% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'996.1675'[i11_0,i11_1_0,T_i2_0_2052,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 2, 512, 2048) %'add.9'[i11_0,i11_1_0,i0.128+128T_i2_0_2052,i1.2048] # id=1565, src_id=None, , instances=16 # dl = tensor_op_name: add.9_pftranspose_996 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 1.558% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 512) %'_reload_1526'[i16_0_1076,i13_1_0,i4_0_1_1529_2053_0,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM3DBlk partitions[3] bfloat16 (4, 2, 2, 128, 2048) %'_spill_1523'[i4_0_1_1529_2053_0,i16_0_1076,i13_1_0,i0.128,i1.512+1024i2.2+512i3.2] # id=1528, src_id=None, , instances=16 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 1.558% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'1000.1680'[T_i20_0_1008,T_i20_1_0_1008,T_i2_0_2055,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4194304,) %'all_reduce.3-buffer-2076'[2097152T_i20_0_1008+2048i0.128+1048576T_i20_1_0_1008+i1.2048+262144T_i2_0_2055] # id=1574, src_id=None, , instances=16 # dl = tensor_op_name: all_reduce.3_pftranspose_1000 | hlo_id: 66 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 25.532us (8.000MiB, est bw: 328.547GB/s, 0.950% of tot. time) for bfloat16<128 x 2048> DRAM3DBlk partitions[3] bfloat16 (4, 2, 2, 128, 2048) %'_spill_1523'[i2_0_1_1639_2057_0,i11_0,i11_1_0,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %1014[i11_0,i11_1_0,i2_0_1_1639_2057_0,i0.128,i1.2048] # id=1525, src_id=None, , instances=16 # dl = tensor_op_name: _custom-call.348 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 25.532us (8.000MiB, est bw: 328.547GB/s, 0.950% of tot. time) for bfloat16<128 x 2048> non_local bfloat16 (4194304,) %'dot.14-buffer-2074'[2097152i16_0_1076+2048i0.128+1048576i16_1_0_0_1076_1531+i1.2048+262144i16_1_0_1_1076_1531] = store bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %1077[i16_0_1076,i16_1_0_0_1076_1531,i16_1_0_1_1076_1531,i0.128,i1.2048] # id=1379, src_id=None, , instances=16 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8740 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.065 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.023 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.037 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.056 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.006 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.071 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.053 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.013 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.061 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.014 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.103 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.014 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.104 seconds +2025-11-04T21:38:43Z INFO 8739 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.011 seconds +2025-11-04T21:38:43Z INFO 8740 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.089 seconds +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:43Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.108 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.042 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.016 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.169 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.036 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.016 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.026 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.038 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.059 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.060 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.043 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.050 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.051 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.133 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.120 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.050 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.049 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.037 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.038 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.030 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.045 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.046 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.062 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.012 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.061 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.018 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.046 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.039 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.042 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.086 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.054 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8738 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:44Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.033 seconds +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.026 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.029 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.043 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.045 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.012 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.042 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.023 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.046 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.052 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.042 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.059 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.095 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.026 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.028 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.166 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.025 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.135 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.441 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.014 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.018 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.175 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.180 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.034 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 17.910% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 6, 2, 2, 128, 2048) %1783[i11_0,i11_1_0,2i10_0_0_1_0+i10_0_0_1_1,i10_0_0_0,c2_1397,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input69'[i10_0_0_0,2i10_0_0_1_0+i10_0_0_1_1,i0.128,c2_1397,i1.2048] # id=1658, src_id=None, , instances=96 # dl = tensor_op_name: _dot.4 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 244.771us (48.000MiB, est bw: 205.627GB/s, 17.910% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 6, 2, 2, 128, 2048) %1781[i16_0_1432,i13_1_0,2i12_0_0_1_0+i12_0_0_1_1,i12_0_0_0,c2_1408,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input71'[i12_0_0_0,2i12_0_0_1_0+i12_0_0_1_1,i0.128,c2_1408,i1.2048] # id=1661, src_id=None, , instances=96 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 198.539us (24.000MiB, est bw: 126.755GB/s, 14.527% of tot. time) for bfloat16<128 x 512> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 6, 128, 2, 512) %'input68_local_1426'[i16_0_1432,i15_0_0_0_1,i15_0_0_0_0,c1_1418_2435,c2_1419_2435,i0.128,i3.2,i1.128+128i2.2+256p_1957_2435] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input68'[i15_0_0_0_1+2i15_0_0_0_0,p_1957_2435,c1_1418_2435,i0.128,c2_1419_2435,i3.2,i2.2,i1.128] # id=1667, src_id=None, , instances=192 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.033% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %1784[i37_0,i37_1_0,i38_0_0,c1_1442,c2_1443,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input78'[i38_0_0,c1_1442,i0.128,i1.2048+2048c2_1443] # id=1681, src_id=None, , instances=32 # dl = tensor_op_name: _dot.9 | hlo_id: 71 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'1350.1923'[i11_0,i11_1_0,T_i2_0_2433,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 2, 512, 2048) %'add.4'[i11_0,i11_1_0,i0.128+128T_i2_0_2433,i1.2048] # id=1796, src_id=None, , instances=16 # dl = tensor_op_name: add.4_pftranspose_1350 | hlo_id: 15 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_reload_1777'[i16_0_1432,i13_1_0,i4_0_0_711_1780_2434,i4_0_1_1780_0_2434,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_spill_1774'[i4_0_0_711_1780_2434,i4_0_1_1780_0_2434,i16_0_1432,i13_1_0,i0.128,i1.2048] # id=1779, src_id=None, , instances=16 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2048) %'1354.1928'[i37_0,i37_1_0,T_i2_0_2436,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (4194304,) %'all_reduce.1-buffer-2494'[2097152i37_0+2048i0.128+1048576i37_1_0+i1.2048+262144T_i2_0_2436] # id=1805, src_id=None, , instances=16 # dl = tensor_op_name: all_reduce.1_pftranspose_1354 | hlo_id: 54 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_reload_1788'[i67_0,i67_1_0_0,i51_0_0_1791,i51_0_1_0_1791,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_spill_1785'[i51_0_0_1791,i51_0_1_0_1791,i67_0,i67_1_0_0,i0.128,i1.2048] # id=1790, src_id=None, , instances=16 # dl = tensor_op_name: _dot.8 | hlo_id: 114 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_reload_1788_reload_1794'[i2_0_1518,i2_1_0_1518_0,i51_0_0_1791_1793,i51_0_1_0_1791_1793,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_spill_1785'[i51_0_0_1791_1793,i51_0_1_0_1791_1793,i2_0_1518,i2_1_0_1518_0,i0.128,i1.2048] # id=1792, src_id=None, , instances=16 # dl = tensor_op_name: _dot.8 | hlo_id: 114 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 3.064% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 16, 128) %'get_tuple_element.2_local_1526'[i98_0_0_0_1543,c0_1520_0,c0_1520_1,c1_1521,i0.128,i1.16,i2.128] = load bfloat16<128 x 2048> non_local bfloat16 (4, 2, 128, 16, 128) %'get_tuple_element.2'[2c0_1520_0+c0_1520_1,c1_1521,i0.128,i1.16,i2.128] # id=1733, src_id=None, , instances=16 # dl = tensor_op_name: _dot.10 | hlo_id: 173 | [[i0.128];[i2.128, i1.16]] -> [[i0.128];[i2.128, i1.16]] +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.036 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.012 seconds +2025-11-04T21:38:45Z INFO 8739 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.024 seconds +2025-11-04T21:38:45Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.111 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.014 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.032 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8739 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 60284) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.484 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.231 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.020 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.095 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.068 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.543 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.023 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 12.767% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %1881[i34_0,i34_1_0_0,i35_0_0,c1_1532,c2_1533,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 2, 2048) %'input67'[i35_0_0,c1_1532,i0.128,c2_1533,i1.2048] # id=1742, src_id=None, , instances=32 # dl = tensor_op_name: _dot.2 | hlo_id: 32 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 51.143us (4.000MiB, est bw: 82.011GB/s, 7.919% of tot. time) for bfloat16<128 x 256> TongaSB partitions[3] bfloat16 (2, 2, 16, 128, 256) %'transpose.1_pftranspose_1450'[T_i2_1_0_1454,T_i2_0_1454,i3_0,i0.128,i1.256] = indirect_load bfloat16<128 x 256> {'CrossPassTensor': ''}bfloat16 (151936, 2, 2, 256) %'input60'[i0.128,T_i2_0_1454,T_i2_1_0_1454,i1.256] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[1] int32 (2, 128, 16, 1) %'input0_local_1493'[T_i2_1_0_1454,i0.128,i3_0,0] # id=1698, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=64 # dl = tensor_op_name: _gather.41 | hlo_id: 12 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 6.484% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 512) %'intermediate0_pftranspose_1455'[i0_0,i1_1_0,i1_1_1_0,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 2, 2, 512) %'all_gather.1'[i1_1_0,0,i3.2,i1_1_1_0,i0.128,i0_0,i2.2,i1.512] # id=1701, src_id=None, , instances=16 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 6.484% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[3] bfloat16 (2, 2, 4, 128, 2, 2, 512) %'custom-call.177.1878'[i17_0_1521_1880,i16_0_1_0_1521_1880,i16_0_1_1_1521_1880,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 2, 2, 512) %'all_gather.1'[i16_0_1_0_1521_1880,0,i3.2,i16_0_1_1_1521_1880,i0.128,i17_0_1521_1880,i2.2,i1.512] # id=1737, src_id=None, , instances=16 # dl = tensor_op_name: _custom-call.177 | hlo_id: 24 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 6.484% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_reload_1885'[i64_0,i64_1_0_0,i48_0_1_0_1888,i48_0_0_1888,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_spill_1882'[i48_0_1_0_1888,i48_0_0_1888,i64_0,i64_1_0_0,i0.128,i1.2048] # id=1887, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 6.484% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_reload_1885_reload_1891'[i2_0_1578,i2_1_0_1578_0,i48_0_1_0_1888_1890,i48_0_0_1888_1890,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (2, 2, 2, 2, 128, 2048) %'_spill_1882'[i48_0_1_0_1888_1890,i48_0_0_1888_1890,i2_0_1578,i2_1_0_1578_0,i0.128,i1.2048] # id=1889, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 41.879us (8.000MiB, est bw: 200.308GB/s, 6.484% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 16, 128) %'get_tuple_element.1_local_1586'[i95_0_0_0_1603,c0_1580_0,c0_1580_1,c1_1581,i0.128,i1.16,i2.128] = load bfloat16<128 x 2048> non_local bfloat16 (4, 2, 128, 16, 128) %'get_tuple_element.1'[2c0_1580_0+c0_1580_1,c1_1581,i0.128,i1.16,i2.128] # id=1842, src_id=None, , instances=16 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i2.128, i1.16]] -> [[i0.128];[i2.128, i1.16]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 4.726% of tot. time) for bfloat16<128 x 1024> {'IntermediateTensor': ''}bfloat16 (2, 2, 512, 2, 2, 512) %'intermediate0'(init=0.0)[T_i0_0_1459,T_i0_1_0_1459,i0.128+128T_i0_1_1_1459_0,i2.2,T_i1_1_0_1459,i1.512] = store bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 1024) %'1455.2023'[T_i0_0_1459,T_i1_1_0_1459,T_i0_1_0_1459,T_i0_1_1_1459_0,i0.128,i1.512+512i2.2] # id=2021, src_id=None, , instances=32 # dl = tensor_op_name: intermediate0_pftranspose_1455 | hlo_id: 1 | [[i0.128];[i1.512, i2.2]] -> [[i0.128];[i1.512, i2.2]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 30.524us (8.000MiB, est bw: 274.819GB/s, 4.726% of tot. time) for bfloat16<128 x 1024> non_local bfloat16 (4194304,) %'dot.4-buffer-2754'[1024i95_0_0_0_1603+2048i0.128+262144i96_0_1603+i1.1024] = store bfloat16<128 x 1024> TongaSB partitions[2] bfloat16 (2, 16, 128, 1024) %1604[i95_0_0_0_1603,i96_0_1603,i0.128,i1.1024] # id=1846, src_id=None, , instances=32 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.1024]] -> [[i0.128];[i1.1024]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 26.236us (2.000MiB, est bw: 79.934GB/s, 4.062% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output2'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[5] int32 (2, 2, 2, 2, 4, 128, 1) %'scatter.6719.2278'[i111_0,i105_0,i105_1,i104_1_0_0,i104_1_0_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 4, 2, 128) %'transpose.19'[i111_0,i104_1_0_0,i105_0,i0.128,i104_1_0_1,i105_1,i1.128] # id=1860, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=64 # dl = tensor_op_name: _scatter.6719 | hlo_id: 187 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 60284) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.372 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.068 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8740 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.014 seconds +2025-11-04T21:38:46Z INFO 8739 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.074 seconds +2025-11-04T21:38:46Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.076 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.053 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.011 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.045 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.013 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.014 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.015 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.071 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.065 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.065 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.015 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.012 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.227 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.009 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: BirCodeGen estimate #instances=2497 in sg0000 +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: IR signature: 5b877131f2ef8acfc34e97e3867e07024be6f656e97d33a85d081e6375f4e2da for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:48Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.612 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:48Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.086 seconds +2025-11-04T21:38:48Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8738 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.160 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.121 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.122 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: BirCodeGen estimate #instances=5010 in sg0001 +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: IR signature: ae3cf2eeac56439f4dbfa3195a8a1557e2d9206a5ca521c2c69b0869d598cc3d for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:48Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.041 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:48Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8739 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.058 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: BirCodeGen estimate #instances=2497 in sg0000 +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: IR signature: 59e1c733b0daccd547db5862709f43e810d5628a4d5446ad5ca95c693a22ceb7 for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: Weights total number of bytes: 262402 +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.097 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:48Z INFO 8738 [Tensorizer]: Successfully built model. +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.024 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: BirCodeGen estimate #instances=5010 in sg0001 +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: IR signature: a1b6adb35cb835694160853d7728d7a2a42f91918c31b0588938b0c674982ff8 for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: Weights total number of bytes: 262146 +2025-11-04T21:38:48Z INFO 8739 [Tensorizer]: Successfully built model. +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.026 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.033 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.085 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.085 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.034 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.034 seconds +2025-11-04T21:38:48Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.419 seconds +2025-11-04T21:38:49Z INFO 8740 [Tensorizer]: BirCodeGen estimate #instances=27737 in sg0002 +2025-11-04T21:38:49Z INFO 8740 [Tensorizer]: IR signature: 40b0e2826a5aa877434cdd2afa7ab0c573e5ec22d18ed88beaee2cb56969f285 for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:49Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8740 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.291 seconds +2025-11-04T21:38:50Z INFO 8740 [Tensorizer]: BirCodeGen estimate #instances=27737 in sg0002 +2025-11-04T21:38:50Z INFO 8740 [Tensorizer]: IR signature: f488c29018b2708cbfbc5b1a6d95ed80e62df8a6ac43f19858b47ac5d0410655 for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:50Z INFO 8740 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:50Z INFO 8740 [Tensorizer]: Successfully built model. +2025-11-04T21:38:50Z USER 8698 [root/Tensorizer/Tensorizer]: Tensorizer finished after 15.430 seconds +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: End tensorization +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:50Z INFO 8698 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:50Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:50Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:50Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:50Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:50Z INFO 8698 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:50Z INFO 8698 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,io,spill_reload,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:50Z INFO 8698 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:50Z INFO 9072 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e" +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:50Z INFO 9072 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:50Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1926 blocks=6 instructions=1664 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.232.2310}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.004 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 102mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.232.2310}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.020 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 110mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.050 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 132mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.094 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 162mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.099 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 167mb, ru_maxrss: 215mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.218 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 221mb, ru_maxrss: 221mb (delta=6mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.233 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=7mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:50Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.236 seconds +2025-11-04T21:38:50Z INFO 9072 [BackendPassManager]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=7mb) +2025-11-04T21:38:50Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:50Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1926 blocks=6 instructions=1664 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z USER 9072 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z USER 9072 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:50Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=424 blocks=2 instructions=144 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=318 blocks=2 instructions=144 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z USER 9072 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 424 memory location(s), 2 block(s), and 144 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1184 blocks=2 instructions=1376 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 318 memory location(s), 2 block(s), and 144 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1184 memory location(s), 2 block(s), and 1376 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:50Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 9072 [BackendPassManager]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:50Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1926 blocks=6 instructions=1664 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 222mb, ru_maxrss: 222mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 592 memory location(s), 1 block(s), and 688 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=592 blocks=1 instructions=688 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 224mb, ru_maxrss: 224mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 225mb, ru_maxrss: 225mb (delta=0mb) +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 226mb, ru_maxrss: 226mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 228mb, ru_maxrss: 228mb (delta=0mb) +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 159 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=159 blocks=1 instructions=72 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:50Z INFO 9072 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 212 memory location(s), 1 block(s), and 72 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=212 blocks=1 instructions=72 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:50Z INFO 9072 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:50 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Total count: 2495 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Matmult: 1281 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: TensorScalarPtr: 340 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: TensorTensor: 268 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: GenericCopy: 222 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Activation: 110 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: DMACopy: 96 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Load: 81 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Save: 70 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Memset: 10 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: CollectiveCompute: 3 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 96 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.105 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 307mb, ru_maxrss: 307mb (delta=83mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2485 memory location(s), 1 block(s), and 2495 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=2485 blocks=1 instructions=2495 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 311mb, ru_maxrss: 311mb (delta=4mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Total count: 2497 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Matmult: 1281 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: TensorScalarPtr: 340 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: TensorTensor: 268 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: GenericCopy: 222 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Activation: 110 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: DMACopy: 97 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Load: 81 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Save: 71 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Memset: 10 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: CollectiveCompute: 3 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 96 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.136 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 321mb, ru_maxrss: 321mb (delta=90mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2485 memory location(s), 1 block(s), and 2497 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=2485 blocks=1 instructions=2497 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.014 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 322mb, ru_maxrss: 322mb (delta=1mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Total count: 5010 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Matmult: 3656 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Load: 284 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: TensorScalarPtr: 254 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: GenericCopy: 245 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: TensorTensor: 240 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Activation: 164 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Save: 73 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: DMACopy: 66 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Memset: 12 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 64 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.276 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 391mb, ru_maxrss: 391mb (delta=165mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2712 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Total count: 5008 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Matmult: 3656 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Load: 284 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: TensorScalarPtr: 254 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: GenericCopy: 245 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: TensorTensor: 240 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Activation: 164 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Save: 72 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: DMACopy: 65 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Memset: 12 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: CollectiveCompute: 2 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 64 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.284 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 391mb (delta=163mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=2712 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2712 memory location(s), 1 block(s), and 5008 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=2712 blocks=1 instructions=5008 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.019 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 376mb, ru_maxrss: 391mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.018 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 391mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Total count: 15907 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Matmult: 12682 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: GenericCopy: 1516 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Load: 551 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Save: 331 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: TensorTensor: 159 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Activation: 119 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: TensorScalarPtr: 86 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Memset: 26 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: TensorReduce: 13 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Select: 4 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Iota: 3 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.493 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 437mb, ru_maxrss: 437mb (delta=215mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6470 memory location(s), 1 block(s), and 15907 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=6470 blocks=1 instructions=15907 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.039 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 397mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:50 2025 + +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Total count: 15918 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Matmult: 12682 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: GenericCopy: 1516 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Load: 551 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Save: 342 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: TensorTensor: 159 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Activation: 119 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: TensorScalarPtr: 86 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Memset: 26 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: TensorReduce: 13 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: CollectiveCompute: 8 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Select: 4 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Iota: 3 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.598 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 411mb, ru_maxrss: 437mb (delta=215mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6470 memory location(s), 1 block(s), and 15918 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=6470 blocks=1 instructions=15918 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.038 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.644 seconds +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=215mb) +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=11479 blocks=6 instructions=46041 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2383 blocks=2 instructions=4989 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z USER 9072 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=2667 blocks=2 instructions=10017 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=6429 blocks=2 instructions=31035 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2667 memory location(s), 2 block(s), and 10017 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2379 memory location(s), 2 block(s), and 4989 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6429 memory location(s), 2 block(s), and 31035 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: curr_vmrss: 363mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=11475 blocks=6 instructions=46041 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1267_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1272_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.024 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 366mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.022 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.031 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.041 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 367mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.097 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 376mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.105 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.106 seconds +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=11475 blocks=6 instructions=46041 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:51Z USER 9072 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:51Z USER 9072 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=2667 blocks=2 instructions=10017 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=2379 blocks=2 instructions=4989 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=6429 blocks=2 instructions=31035 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.002 seconds +2025-11-04T21:38:51Z USER 9072 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2379 memory location(s), 2 block(s), and 4989 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 2667 memory location(s), 2 block(s), and 10017 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.009 seconds +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6429 memory location(s), 2 block(s), and 31035 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.013 seconds +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:51Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=11475 blocks=6 instructions=46041 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z WARNING 9072 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 8 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.004 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z WARNING 9072 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z WARNING 9072 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.007 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 8 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.004 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 16 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 377mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 448 bytes/partition +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 448 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.001 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.003 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.002 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.008 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.005 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z WARNING 9072 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 16 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.007 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.005 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.008 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.015 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 378mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.019 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.024 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.008 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z WARNING 9072 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 8 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.008 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 1 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.040 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [build_flow_deps]: Allocs: 1189 instructions: 2493 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.034 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [build_flow_deps]: Allocs: 1190 instructions: 2496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z WARNING 9072 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 15 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.037 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.015 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.009 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.003 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.007 seconds +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.029 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 6900 edges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [build_flow_deps]: Done build fdeps 6900 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:51Z INFO 9072 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.007 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.009 seconds +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.022 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.044 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.004 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1189 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1189 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1190 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: size = 290 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: found 708 edges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: mean: 4.88276 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: median: 5.98549 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 5664 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: No split opportunities: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: lo = 290 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: total = 290 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 379mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [build_flow_deps]: Allocs: 1334 instructions: 5010 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 57 PSUM Banks +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 35742468 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3715 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 21495808 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2399 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: size = 855 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: found 148 accumulation groups +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 6902 edges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [build_flow_deps]: Done build fdeps 6902 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.070 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: largest = _dot.3-t1658_i61 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: requires 40960 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 []: find first defs for local +2025-11-04T21:38:51Z INFO 9072 []: find first defs for global +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.012 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1190 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1190 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 15159 edges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [build_flow_deps]: Done build fdeps 15159 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1191 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: size = 290 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.067 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5010 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1334 blocks=1 instructions=5010 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: found 708 edges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: mean: 4.88276 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: median: 5.98549 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 5664 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [build_flow_deps]: Allocs: 1333 instructions: 5007 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.063 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.008 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1333 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1334 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 380mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: lo = 290 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: total = 290 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.025 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: size = 326 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: found 1087 edges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: mean: 6.66871 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: median: 6.99997 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 8696 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 381mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: 72 remat count +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Num intervals 855 Num locations 855 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: edge: 20447 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: mean: 47.8292 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: median: 43.9454 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.011 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: safe = 767 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: unsafe = 74 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: inf = 12 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: total = 853 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 855 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4 PSUM Banks +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Total: 853 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (853) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Rover zone: 0.931 (794) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.009 (8) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.060 (51) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.001 (1) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.999 (852) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.994 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: lo = 326 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: total = 326 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 35742468 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3715 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 21495808 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2399 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.063 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.019 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.003 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 382mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1191 memory location(s), 1 block(s), and 2493 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1191 blocks=1 instructions=2493 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 57238276, 25.8061% input load, 8.24377% output write, 65.9502% spill/reload [sg0000] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.005 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 7864320, 13.7396% out of total dma traffic(1.47709e+07) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.018 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 57 PSUM Banks +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.015 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 35742468 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3715 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 21495810 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2398 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 15157 edges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [build_flow_deps]: Done build fdeps 15157 Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 24 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 24 spill/reload memory locations +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: size = 856 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 12582912, 33.3333% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: found 148 accumulation groups +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: largest = _dot.3-t1658_i31 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: tensors = 10 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: requires 40960 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15908 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3439 blocks=1 instructions=15908 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 []: find first defs for local +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 []: find first defs for global +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.115 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.015 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 2180 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 103916036 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2889 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 27262978 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2957 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 19489540 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 17301504 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2180 bytes +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 20447232, 35.723% out of total dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 36791044, 28.7479% input load, 12.8254% output write, 58.4267% spill/reload [sg0000] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 19489540 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 17301504 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2180 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 172 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1072 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.020 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 383mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2454 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1151 blocks=1 instructions=2454 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: size = 952 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: 72 remat count +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: found 282 accumulation groups +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: largest = _dot.6-t1592_i7 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Num intervals 856 Num locations 856 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: edge: 20461 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: mean: 47.8061 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: median: 44.1866 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.010 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 384mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 []: find first defs for local +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_memsets: 5 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 []: find first defs for global +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: safe = 768 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: unsafe = 74 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: inf = 12 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: total = 854 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 856 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Total: 854 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (854) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Rover zone: 0.931 (795) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.009 (8) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.060 (51) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.001 (1) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.999 (853) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.993 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.114 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5007 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 35742468 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3715 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 21495810 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2398 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.032 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=1333 blocks=1 instructions=5007 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.006 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1192 memory location(s), 1 block(s), and 2496 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1192 blocks=1 instructions=2496 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 57238278, 25.8061% input load, 8.24377% output write, 65.9502% spill/reload [sg0000] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 48 Sb address +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.018 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.029 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2454 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1151 blocks=1 instructions=2454 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: reserved space = 196864 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:51Z INFO 9072 []: find first defs for local +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:51Z INFO 9072 []: find first defs for global +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.004 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2454 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1151 blocks=1 instructions=2454 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 7864320, 13.7396% out of total dma traffic(1.47709e+07) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2454 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1151 blocks=1 instructions=2454 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 52 out of 230 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2454 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1151 blocks=1 instructions=2454 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 385mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1151 memory location(s), 1 block(s), and 2455 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1151 blocks=1 instructions=2455 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 2455, number of allocs: 1151 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2769-0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.000213 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2769-0] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: input0: [ 4 128 2048 ] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: input1: [ 4 128 2048 ] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: input2: [ 4 2048 128 ] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: output0: [ 4 128 2048 ] +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 1048576 +Memory Location: {reshape.16}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 1048576 +Memory Location: {reshape.24}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 2048 / 2048 = 1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Scratch sbuf for kernel I-2769-0: [105472, 165756) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 24 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 24 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.025 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 12582912, 33.3333% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 0.034601 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.011 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 394mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2294 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2294 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2294 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2294 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2294 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2294 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1332 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=1332 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1333 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=1333 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 392mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg02) [build_flow_deps]: Allocs: 3439 instructions: 15903 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: 179 remat count +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Num intervals 952 Num locations 952 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 2179 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 19489540 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 17301506 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2179 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: size = 326 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 20447232, 35.723% out of total dma traffic +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 36791046, 28.7479% input load, 12.8254% output write, 58.4267% spill/reload [sg0000] +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 19489540 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3448 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 17301506 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2179 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4243456 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 172 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1072 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.044 seconds +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 394mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2457 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1152 blocks=1 instructions=2457 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.043 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: found 1087 edges +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: mean: 6.66871 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: median: 6.99997 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 8696 bytes +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: edge: 34912 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: mean: 73.3445 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: median: 62.8963 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 394mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 395mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 395mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15127 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2990 blocks=1 instructions=15127 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: safe = 594 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: unsafe = 283 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: inf = 73 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: total = 950 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 194 #Pinned 0 #Safe 0 minCost 0.00452202 maxCost 0.0359378 locations 952 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: new candidates = 55 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.040 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2310 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2310 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Total: 950 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (950) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Rover zone: 0.774 (735) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.013 (12) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.214 (203) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.001 (1) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Blocks tall: 0.999 (949) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.995 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:51Z INFO 9072 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2310 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:51Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2310 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:51 2025 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 103916036 +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2889 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 27262978 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2957 bytes +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.105 seconds +2025-11-04T21:38:51Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: lo = 326 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: total = 326 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:51Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:51Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.052 seconds +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 396mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:51Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:51Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_memsets: 1 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.018 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 398mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 48 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.031 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 399mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1335 memory location(s), 1 block(s), and 5009 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1335 blocks=1 instructions=5009 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 131179014, 60.0326% input load, 3.19739% output write, 36.77% spill/reload [sg0001] +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.042 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 399mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2310 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2310 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2310 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2310 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.071 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2457 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1152 blocks=1 instructions=2457 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [build_flow_deps]: Allocs: 2310 instructions: 4458 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: reserved space = 196864 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 9961472, 7.5938% out of total dma traffic(7.87502e+07) +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.011 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2457 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1152 blocks=1 instructions=2457 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 400mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2457 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1152 blocks=1 instructions=2457 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 52 out of 231 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2457 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1152 blocks=1 instructions=2457 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 401mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1152 memory location(s), 1 block(s), and 2458 instruction(s). Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1152 blocks=1 instructions=2458 Max writers: 32 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 2458, number of allocs: 1152 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2769-0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.002549 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2769-0] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: input0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: input1: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: input2: [ 4 2048 128 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: output0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 0 +Memory Location: {reshape.16}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 0 +Memory Location: {reshape.24}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 2048 / 2048 = 1 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Scratch sbuf for kernel I-2769-0: [105472, 165756) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 12 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 12 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 2 PSUM Banks +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.044 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 404mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 103916036 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 2889 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 27262976 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 2958 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 2 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 2 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: size = 951 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 7340032, 15.2174% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 52134 edges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [build_flow_deps]: Done build fdeps 52134 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: found 282 accumulation groups +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 10913 edges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [build_flow_deps]: Done build fdeps 10913 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.044 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: largest = _dot.6-t1592_i40 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 408mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2310 memory location(s), 1 block(s), and 4458 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2310 blocks=1 instructions=4458 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.011 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 412mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2298 memory location(s), 1 block(s), and 4430 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2298 blocks=1 instructions=4430 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 0.098138 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.035 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2295 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2295 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 416mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2295 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2295 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 416mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2295 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2295 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 88187396 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 25690114 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [build_flow_deps]: Allocs: 2990 instructions: 15126 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17301504, 13.1892% out of total dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 113877510, 60.4059% input load, 3.68317% output write, 35.9109% spill/reload [sg0001] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 88187396 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 25690114 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 2129920 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2027 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.093 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 417mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4975 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1300 blocks=1 instructions=4975 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: 179 remat count +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Num intervals 951 Num locations 951 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.051 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 421mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2311 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2311 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 421mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2311 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2311 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2769-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,114948>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: edge: 34898 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: mean: 73.3922 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: median: 63.0205 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.087 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 423mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2298 memory location(s), 1 block(s), and 4430 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2298 blocks=1 instructions=4430 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: safe = 593 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: unsafe = 283 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: inf = 73 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: total = 949 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 194 #Pinned 0 #Safe 0 minCost 0.00452202 maxCost 0.0359378 locations 951 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: new candidates = 55 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 98 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Total: 949 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (949) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Rover zone: 0.774 (735) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.012 (11) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.214 (203) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.001 (1) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Blocks tall: 0.999 (948) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.996 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.032 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 422mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 64 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.305 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 422mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2311 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3439 memory location(s), 1 block(s), and 15903 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3439 blocks=1 instructions=15903 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2311 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 422mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2311 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2311 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [build_flow_deps]: Allocs: 2311 instructions: 4461 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.019 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 103916036 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2889 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 27262976 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2958 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 2129920 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.150 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 424mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 424mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4366 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2233 blocks=1 instructions=4366 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:52Z USER 9072 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.010 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 425mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4366 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 425mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1334 memory location(s), 1 block(s), and 5006 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=1334 blocks=1 instructions=5006 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 131179012, 60.0326% input load, 3.19739% output write, 36.77% spill/reload [sg0001] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 40220 edges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [build_flow_deps]: Done build fdeps 40220 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 10915 edges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [build_flow_deps]: Done build fdeps 10915 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.037 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 424mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2311 memory location(s), 1 block(s), and 4461 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2311 blocks=1 instructions=4461 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.004 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 424mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2299 memory location(s), 1 block(s), and 4433 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2299 blocks=1 instructions=4433 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 54 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 9961472, 7.5938% out of total dma traffic(7.87502e+07) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 12 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 12 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 2 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 2 spill/reload memory locations +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.053 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2299 memory location(s), 1 block(s), and 4433 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.166 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4975 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1300 blocks=1 instructions=4975 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2299 blocks=1 instructions=4433 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: reserved space = 196608 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: spill space = 6815744 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 6815744 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: size = 13 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 7340032, 15.2174% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 64 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Num intervals 13 Num locations 13 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: lo = 13 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: total = 13 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.017 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4975 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1300 blocks=1 instructions=4975 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.336 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2990 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.027 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 427mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4369 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2234 blocks=1 instructions=4369 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 88187396 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 25690112 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17301504, 13.1892% out of total dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 113877508, 60.4059% input load, 3.68317% output write, 35.9109% spill/reload [sg0001] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 88187396 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 2755 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 25690112 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 2908 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 2129920 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2027 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.093 seconds +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.012 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 427mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 427mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4972 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4975 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=1299 blocks=1 instructions=4972 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.005 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 427mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1300 blocks=1 instructions=4975 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:52Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4369 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 36 out of 256 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4975 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1300 blocks=1 instructions=4975 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1300 memory location(s), 1 block(s), and 4976 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1300 blocks=1 instructions=4976 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 4976, number of allocs: 1300 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2513-0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 0.002254 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2513-0] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: input0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: input1: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: input2: [ 4 2048 128 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: output0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 0 +Memory Location: {reshape.60}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 0 +Memory Location: {reshape.68}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 2048 / 2048 = 1 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Scratch sbuf for kernel I-2513-0: [61440, 121724) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.136 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 429mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3376 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3376 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3377 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3377 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 437mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 98 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 0.064573 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.029 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 439mb (delta=2mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2443 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2443 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 439mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2443 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2443 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 439mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2443 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2443 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.056 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 440mb, ru_maxrss: 440mb (delta=3mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2990 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2990 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 440mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2991 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2991 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 440mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 439mb, ru_maxrss: 440mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 440mb, ru_maxrss: 440mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: size = 1278 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.051 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 442mb, ru_maxrss: 442mb (delta=3mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: size = 1154 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2459 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2459 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 442mb, ru_maxrss: 442mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2459 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2459 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: found 1645 edges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: mean: 2.57433 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: median: 1.68169 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 13160 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: found 1583 edges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: mean: 2.7435 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: median: 1.98285 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 12664 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 54 Sb address +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.024 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 443mb, ru_maxrss: 443mb (delta=1mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2459 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2459 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: lo = 1204 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: total = 1278 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 443mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2459 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2459 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [build_flow_deps]: Allocs: 2459 instructions: 6979 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.109 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 443mb, ru_maxrss: 443mb (delta=6mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: lo = 1080 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: total = 1154 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.073 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 442mb, ru_maxrss: 443mb (delta=3mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.136 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=6mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4972 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=1299 blocks=1 instructions=4972 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: reserved space = 196608 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: spill space = 6815744 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 6815744 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: size = 13 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.027 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 19244 edges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [build_flow_deps]: Done build fdeps 19244 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.039 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2459 memory location(s), 1 block(s), and 6979 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2459 blocks=1 instructions=6979 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: Num intervals 13 Num locations 13 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: lo = 13 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: total = 13 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.021 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4972 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=1299 blocks=1 instructions=4972 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.007 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 6951 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.028 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2447 blocks=1 instructions=6951 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 441mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 4194304 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.013 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 443mb, ru_maxrss: 443mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4972 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=1299 blocks=1 instructions=4972 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 36 out of 255 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 445mb, ru_maxrss: 445mb (delta=2mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4972 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=1299 blocks=1 instructions=4972 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 446mb, ru_maxrss: 446mb (delta=1mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1299 memory location(s), 1 block(s), and 4973 instruction(s). Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=1299 blocks=1 instructions=4973 Max writers: 32 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 4973, number of allocs: 1299 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2513-0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 0.001451 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2513-0] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: input0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: input1: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: input2: [ 4 2048 128 ] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: output0: [ 4 128 2048 ] +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 1048576 +Memory Location: {reshape.60}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[262144,4],[2048,128],[1,2048]] +Offset: 1048576 +Memory Location: {reshape.68}@DRAM(2097152x2)#Internal DebugInfo: +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 2048 / 2048 = 1 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Scratch sbuf for kernel I-2513-0: [61440, 121724) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: seq_len=2048, seq_len2=2048, complete_seq_len2=2048 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 0.100282 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.042 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 463mb (delta=17mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2442 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2442 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2442 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2442 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 464mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2442 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2442 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 62 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.084 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 464mb (delta=21mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2447 memory location(s), 1 block(s), and 6951 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2447 blocks=1 instructions=6951 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 64 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 32 memorylocations +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.051 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2458 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2458 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2458 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2458 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2513-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,70916>(128x4)#Internal DebugInfo: +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.037 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 462mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6887 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2382 blocks=1 instructions=6887 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.016 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2458 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2458 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.001 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2458 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2458 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.010 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 463mb, ru_maxrss: 464mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6887 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [build_flow_deps]: Allocs: 2458 instructions: 6976 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 19242 edges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [build_flow_deps]: Done build fdeps 19242 Tue Nov 4 21:38:52 2025 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.027 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 465mb (delta=1mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2458 memory location(s), 1 block(s), and 6976 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2458 blocks=1 instructions=6976 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 6 PSUM Banks +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.177 seconds +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 465mb (delta=22mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.005 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 465mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2446 memory location(s), 1 block(s), and 6948 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2446 blocks=1 instructions=6948 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 6 PSUM Banks +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.172 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 466mb (delta=23mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 231771806 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3479 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 12751371 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3384 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 231136402 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3500 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 12736000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3776 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: size = 2046 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: size = 1794 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.048 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 470mb, ru_maxrss: 470mb (delta=5mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2446 memory location(s), 1 block(s), and 6948 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2446 blocks=1 instructions=6948 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: found 1271 accumulation groups +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1193_i2 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: found 1147 accumulation groups +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1193_i48 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 64 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.029 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 469mb, ru_maxrss: 470mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6884 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2381 blocks=1 instructions=6884 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for local +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: 432 remat count +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:52Z USER 9072 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 471mb, ru_maxrss: 471mb (delta=1mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6884 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:52Z INFO 9072 []: find first defs for global +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Num intervals 1794 Num locations 1794 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: edge: 30203 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: mean: 33.6711 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: median: 26.4587 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: safe = 1519 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: unsafe = 214 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: inf = 59 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: total = 1792 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 196 #Pinned 0 #Safe 0 minCost 0.00452202 maxCost 1.13113 locations 1794 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: new candidates = 55 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Total: 1792 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (1792) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Rover zone: 0.900 (1613) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.014 (25) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.086 (154) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.015 (26) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.001 (2) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.716 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.714 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.714 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.984 (1764) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.816 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: 442 remat count +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 231136402 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3500 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 12736000 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3776 bytes +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.128 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 475mb, ru_maxrss: 475mb (delta=9mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Num intervals 2046 Num locations 2046 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: edge: 31789 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: mean: 31.0743 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: median: 23.3305 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.028 seconds +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2992 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2992 blocks=1 instructions=15126 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 243872402, 89.6167% input load, 0% output write, 10.3833% spill/reload [sg0002] +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: safe = 1769 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: unsafe = 216 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: inf = 59 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: total = 2044 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 196 #Pinned 0 #Safe 0 minCost 0.00452202 maxCost 1.13113 locations 2046 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: new candidates = 55 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Total: 2044 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (2044) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Rover zone: 0.886 (1811) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.033 (67) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.079 (162) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Slice zone: 0.002 (4) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.057 (116) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.006 (12) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.588 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.612 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.842 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.937 (1916) +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.742 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.981 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 5246976, 2.15153% out of total dma traffic(2.1855e+08) +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:52Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:52Z INFO 9072 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 231771806 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3479 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 12751371 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3384 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.352 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 475mb (delta=10mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 3488 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 3776 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 225889426 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3488 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 12736000 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3776 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.028 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3378 memory location(s), 1 block(s), and 15840 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=3378 blocks=1 instructions=15840 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 244523177, 89.5074% input load, 1.63584e-06% output write, 10.4926% spill/reload [sg0002] +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 5246976, 2.15153% out of total dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 238625426, 89.3883% input load, 0% output write, 10.6117% spill/reload [sg0002] +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 225889426 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3488 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 12736000 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3776 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3501 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.249 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15115 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2980 blocks=1 instructions=15115 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 5242880, 2.14412% out of total dma traffic(2.18866e+08) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 177 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 3467 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 3384 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 226528926 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3467 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 12751371 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3384 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 5242880, 2.14412% out of total dma traffic +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 239280297, 89.2775% input load, 1.67168e-06% output write, 10.7225% spill/reload [sg0002] +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 226528926 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3467 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 12751371 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3384 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3461 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.158 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 468mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15830 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3367 blocks=1 instructions=15830 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 194 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 23 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 60 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 31 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 124 Sb address +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.454 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15115 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2980 blocks=1 instructions=15115 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: spill space = 4194304 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 4194304 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: size = 8 +2025-11-04T21:38:53Z INFO 9072 []: find first defs for local +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.362 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 475mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15830 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3367 blocks=1 instructions=15830 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:53Z INFO 9072 []: find first defs for global +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: spill space = 4201476 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 4222976 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: size = 15 +2025-11-04T21:38:53Z INFO 9072 []: find first defs for local +2025-11-04T21:38:53Z INFO 9072 []: find first defs for global +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: Num intervals 8 Num locations 8 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: lo = 8 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: total = 8 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.072 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Num intervals 15 Num locations 15 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 477mb, ru_maxrss: 477mb (delta=2mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15115 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2980 blocks=1 instructions=15115 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: lo = 15 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: total = 15 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.042 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=2mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15830 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3367 blocks=1 instructions=15830 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.022 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15830 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3367 blocks=1 instructions=15830 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 609 out of 1468 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.007 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15830 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3367 blocks=1 instructions=15830 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.006 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 4194304 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.040 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 15834, number of allocs: 3367 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15115 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2980 blocks=1 instructions=15115 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.0016 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 3e-06 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 609 out of 1329 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.012 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 475mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15115 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2980 blocks=1 instructions=15115 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.013 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 15119, number of allocs: 2980 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.003007 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 4e-06 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.045 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.004 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.009 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.013 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1267_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:53Z WARNING 9072 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1272_i1}@SB<96,17536>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.037 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 474mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [build_flow_deps]: Allocs: 3367 instructions: 15834 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.038 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 475mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.008 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 477mb, ru_maxrss: 477mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [build_flow_deps]: Allocs: 2980 instructions: 15119 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 52076 edges +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [build_flow_deps]: Done build fdeps 52076 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.089 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 484mb, ru_maxrss: 484mb (delta=7mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 40225 edges +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [build_flow_deps]: Done build fdeps 40225 Tue Nov 4 21:38:53 2025 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.064 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 484mb, ru_maxrss: 484mb (delta=7mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.020 seconds +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 484mb, ru_maxrss: 484mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.008 seconds +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 484mb, ru_maxrss: 484mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:53Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.196 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 522mb, ru_maxrss: 522mb (delta=38mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.194 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 501mb, ru_maxrss: 522mb (delta=38mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.038 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 502mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3367 blocks=1 instructions=15834 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 501mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3367 memory location(s), 1 block(s), and 15834 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.068 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 501mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2980 blocks=1 instructions=15119 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.013 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 499mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2980 memory location(s), 1 block(s), and 15119 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 2.427 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 499mb, ru_maxrss: 522mb (delta=85mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=15577 blocks=6 instructions=53459 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=6347 blocks=2 instructions=30953 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=4763 blocks=2 instructions=13771 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4763 memory location(s), 2 block(s), and 13771 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6347 memory location(s), 2 block(s), and 30953 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=4763 blocks=2 instructions=13771 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=6347 blocks=2 instructions=30953 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.004 seconds +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4763 memory location(s), 2 block(s), and 13775 instruction(s). Max writers: 65 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=4467 blocks=2 instructions=8735 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=4763 blocks=2 instructions=13775 Max writers: 65 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4467 memory location(s), 2 block(s), and 8735 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=4467 blocks=2 instructions=8735 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.010 seconds +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4467 memory location(s), 2 block(s), and 8741 instruction(s). Max writers: 65 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=4467 blocks=2 instructions=8741 Max writers: 65 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.024 seconds +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.022 seconds +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4763 memory location(s), 2 block(s), and 13779 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6353 memory location(s), 2 block(s), and 30971 instruction(s). Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=6353 blocks=2 instructions=30971 Max writers: 298 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.029 seconds +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4467 memory location(s), 2 block(s), and 8745 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.069 seconds +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6353 memory location(s), 2 block(s), and 30975 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.104 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 495mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53499 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: reserved space = 164096 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: spill space = 46137344 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 46137344 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.014 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 498mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: reserved space = 164096 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: spill space = 46137344 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 46137344 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: size = 10 +2025-11-04T21:38:54Z INFO 9072 []: find first defs for local +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: reserved space = 6979584 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: spill space = 58720256 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 58720256 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: size = 9 +2025-11-04T21:38:54Z INFO 9072 []: find first defs for local +2025-11-04T21:38:54Z INFO 9072 []: find first defs for global +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: reserved space = 6979584 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: spill space = 58720256 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 58720256 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.027 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 498mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: reserved space = 4236300 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: spill space = 33872898 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 33918976 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:54Z INFO 9072 []: find first defs for global +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: size = 19 +2025-11-04T21:38:54Z INFO 9072 []: find first defs for local +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: Num intervals 10 Num locations 10 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:54Z INFO 9072 []: find first defs for global +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: lo = 10 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: total = 10 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 29360128 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 29360128 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 46137344 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.054 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 497mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: reserved space = 4227072 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: spill space = 33872898 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 33918976 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.065 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 497mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15130 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: lo = 9 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: total = 9 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 4194304 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 4194304 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 37748736 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 37748736 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 58720256 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.080 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 498mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Num intervals 19 Num locations 19 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: lo = 19 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: total = 19 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 4194304 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 4194304 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 20987904 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 20987904 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 29691904 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.090 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 497mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.093 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 494mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53499 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=4763 blocks=2 instructions=13779 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=6353 blocks=2 instructions=30975 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:54Z USER 9072 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 494mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=4467 blocks=2 instructions=8745 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6353 memory location(s), 2 block(s), and 30975 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 493mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.006 seconds +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 493mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4467 memory location(s), 2 block(s), and 8745 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4763 memory location(s), 2 block(s), and 13779 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.010 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53499 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.019 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.011 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.022 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.031 seconds +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.029 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.036 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15130 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.040 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53499 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:54Z USER 9072 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:54Z INFO 9072 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=7986 blocks=3 instructions=27110 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=7597 blocks=3 instructions=26389 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.123 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 506mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 7986 memory location(s), 3 block(s), and 27110 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.126 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 500mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 7597 memory location(s), 3 block(s), and 26389 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: nc_parallel_pass finished after 0.132 seconds +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: curr_vmrss: 496mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53499 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15130 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 492mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware simulation time: 1456630 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.190 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 509mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 506mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.005 seconds +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 507mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware simulation time: 59031477 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.221 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 508mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 505mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z USER 9072 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.009 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 505mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware simulation time: 1431952 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.246 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 507mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 505mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z USER 9072 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.005 seconds +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 505mb, ru_maxrss: 522mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:54Z INFO 9072 [post_scheduler]: Time-aware simulation time: 59697783 +2025-11-04T21:38:55Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.493 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 533mb, ru_maxrss: 533mb (delta=11mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.005 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 531mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.008 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 531mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 [post_scheduler]: Time-aware simulation time: 2180498 +2025-11-04T21:38:55Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.611 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 533mb, ru_maxrss: 533mb (delta=11mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15130 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15130 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=2983 blocks=1 instructions=15130 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.016 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z INFO 9072 [post_scheduler]: Time-aware simulation time: 2339050 +2025-11-04T21:38:55Z INFO 9072 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.815 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 533mb (delta=11mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 522mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.017 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 522mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.847 seconds +2025-11-04T21:38:55Z INFO 9072 [BackendPassManager]: curr_vmrss: 522mb, ru_maxrss: 533mb (delta=11mb) +2025-11-04T21:38:55Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:55Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53495 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z INFO 9072 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=4763 blocks=2 instructions=13779 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9072 (sg01) [SubgraphForkPass]: curr_vmrss: 521mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4763 memory location(s), 2 block(s), and 13779 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=4467 blocks=2 instructions=8745 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z USER 9072 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 521mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=6353 blocks=2 instructions=30971 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 4467 memory location(s), 2 block(s), and 8745 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9072 (sg02) [SubgraphForkPass]: curr_vmrss: 520mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6353 memory location(s), 2 block(s), and 30971 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:55Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.011 seconds +2025-11-04T21:38:55Z INFO 9072 [BackendPassManager]: curr_vmrss: 520mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:55Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53495 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 305 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 233 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 53 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 233 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 258 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 84 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 53 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 258 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 263 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 139 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 305 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 139 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 88 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 84 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 89 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 131 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 744 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 263 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 177 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 105 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 177 Sb address +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.302 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 522mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 166 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 131 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.056 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.357 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 821 PSUM Banks +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [build_flow_deps]: Allocs: 2234 instructions: 4374 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.387 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 105 Sb address +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.075 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 10802 edges +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [build_flow_deps]: Done build fdeps 10802 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.077 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 32 │ 9957277696 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 73984 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10502660 │ +│ Load │ Internal │ 161 │ 15204352 │ +│ Save │ Internal │ 108 │ 14680064 │ +│ Save │ Internal -> Output │ 19 │ 4718594 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 194 │ +│ 512 │ 1 │ +│ 1024 │ 16 │ +│ 2048 │ 90 │ +│ 4096 │ 42 │ +│ 1048576 │ 64 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 2145 #MatMult-Transposes 449 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: IO Tensor size combined: 457986564 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Output │ bfloat16 │ 8388608 │ +│ intermediate0 │ Output │ bfloat16 │ 8388608 │ +│ intermediate3-buffer-2756 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4-buffer-2754 │ Internal │ bfloat16 │ 8388608 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.16 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.24 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.29 │ Internal │ bfloat16 │ 4194304 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9072 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 17 PSUM Banks +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.107 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 178 PSUM Banks +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.079 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 166 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [build_flow_deps]: Allocs: 2233 instructions: 4371 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 10802 edges +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [build_flow_deps]: Done build fdeps 10802 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.055 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [build_flow_deps]: Allocs: 2382 instructions: 6891 +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.042 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 32 │ 9957277696 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 73984 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10502660 │ +│ Load │ Internal │ 161 │ 15204352 │ +│ Save │ Internal │ 108 │ 14680064 │ +│ Save │ Internal -> Output │ 18 │ 4718592 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 194 │ +│ 512 │ 1 │ +│ 1024 │ 16 │ +│ 2048 │ 90 │ +│ 4096 │ 42 │ +│ 1048576 │ 64 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 2145 #MatMult-Transposes 449 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: IO Tensor size combined: 457986564 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Output │ bfloat16 │ 8388608 │ +│ intermediate0 │ Output │ bfloat16 │ 8388608 │ +│ intermediate3-buffer-2756 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4-buffer-2754 │ Internal │ bfloat16 │ 8388608 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.16 │ Internal │ bfloat16 │ 4194304 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.24 │ Internal │ bfloat16 │ 4194304 │ +│ reshape.29 │ Internal │ bfloat16 │ 4194304 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:55Z USER 9072 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4371 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 362 PSUM Banks +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:55Z USER 9072 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.587 seconds +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:55Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 19109 edges +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [build_flow_deps]: Done build fdeps 19109 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.069 seconds +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 209 │ 68166148 │ +│ Load │ Input -> Internal │ 2 │ 524288 │ +│ Load │ Internal │ 181 │ 25690112 │ +│ Save │ Internal │ 125 │ 23592960 │ +│ Save │ Internal -> Output │ 9 │ 4194306 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:55Z INFO 9072 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 193 │ +│ 1024 │ 112 │ +│ 2048 │ 42 │ +│ 4096 │ 172 │ +│ 1048576 │ 64 │ +│ 4194304 │ 3 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 4520 #MatMult-Transposes 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 8388608 │ +│ dot.7-buffer-2492 │ Internal │ bfloat16 │ 8388608 │ +│ dot.11-buffer-2497 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate5 │ Output │ bfloat16 │ 8388608 │ +│ intermediate0 │ Input │ bfloat16 │ 8388608 │ +│ all_reduce.1-buffer-2494 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate6-buffer-2499 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate6 │ Output │ bfloat16 │ 8388608 │ +│ add.4 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.60 │ Internal │ bfloat16 │ 4194304 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.010 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6891 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 40 Sb address +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.080 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 50 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.045 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [build_flow_deps]: Allocs: 2381 instructions: 6888 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 4 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 19106 edges +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [build_flow_deps]: Done build fdeps 19106 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.050 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 209 │ 68166148 │ +│ Load │ Input -> Internal │ 2 │ 524288 │ +│ Load │ Internal │ 181 │ 25690112 │ +│ Save │ Internal │ 125 │ 23592960 │ +│ Save │ Internal -> Output │ 8 │ 4194304 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 193 │ +│ 1024 │ 112 │ +│ 2048 │ 42 │ +│ 4096 │ 172 │ +│ 1048576 │ 64 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 4520 #MatMult-Transposes 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌───────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├───────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate3 │ Input │ bfloat16 │ 8388608 │ +│ dot.7-buffer-2492 │ Internal │ bfloat16 │ 8388608 │ +│ dot.11-buffer-2497 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate5 │ Output │ bfloat16 │ 8388608 │ +│ intermediate0 │ Input │ bfloat16 │ 8388608 │ +│ all_reduce.1-buffer-2494 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate6-buffer-2499 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate6 │ Output │ bfloat16 │ 8388608 │ +│ add.4 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.60 │ Internal │ bfloat16 │ 4194304 │ +└───────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6888 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 36 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.888 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 533mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 74 Sb address +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.114 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 541mb, ru_maxrss: 541mb (delta=8mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.021 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 541mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 1.027 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 541mb (delta=8mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [build_flow_deps]: Allocs: 2983 instructions: 15126 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 40226 edges +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [build_flow_deps]: Done build fdeps 40226 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.092 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 539mb, ru_maxrss: 541mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 2 │ 8388608 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 486 │ 213270540 │ +│ Load │ Internal │ 32 │ 12586118 │ +│ Save │ Internal │ 324 │ 12736000 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 97 │ +│ 4096 │ 433 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 12554 #MatMult-Transposes 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348930064 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ convert.53 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.3-buffer-2076 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate84 │ Input │ bfloat16 │ 8388608 │ +│ dot.14-buffer-2074 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate83 │ Input │ bfloat16 │ 8388608 │ +│ add.9 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ add.9_pftranspose_996-t1615_i7 │ Internal │ bfloat16 │ 1048576 │ +│ add.9_pftranspose_996-t1615_i6 │ Internal │ bfloat16 │ 1048576 │ +│ add.9_pftranspose_996-t1615_i5 │ Internal │ bfloat16 │ 1048576 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.030 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 541mb, ru_maxrss: 541mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15126 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.195 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 542mb, ru_maxrss: 542mb (delta=1mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [build_flow_deps]: Allocs: 3370 instructions: 15845 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 50557 edges +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [build_flow_deps]: Done build fdeps 50557 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.085 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 4 │ 8388608 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 487 │ 213274636 │ +│ Load │ Internal │ 46 │ 12905354 │ +│ Save │ Internal │ 341 │ 12751367 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 113 │ +│ 2048 │ 1 │ +│ 4096 │ 434 │ +│ 9496 │ 2 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 12678 #MatMult-Transposes 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348930064 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input1 │ ExternalInput │ int32 │ 8192 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────────┼──────────┼──────────┼──────────────┤ +│ add.9 │ Internal │ bfloat16 │ 8388608 │ +│ convert.53 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate84 │ Input │ bfloat16 │ 8388608 │ +│ dot.14-buffer-2074 │ Internal │ bfloat16 │ 8388608 │ +│ intermediate83 │ Input │ bfloat16 │ 8388608 │ +│ all_reduce.3-buffer-2076 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t3069 │ Internal │ float32 │ 1048576 │ +│ -t3063 │ Internal │ float32 │ 1048576 │ +│ -t3058 │ Internal │ float32 │ 1048576 │ +└──────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.006 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15845 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 1.341 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=9mb) +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=15583 blocks=6 instructions=53495 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 118 DMA instructions. Moved 10 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 117 DMA instructions. Moved 9 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 134 DMA instructions. Moved 9 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 133 DMA instructions. Moved 8 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 352 DMA instructions. Moved 11 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 333 DMA instructions. Moved 9 DMA instructions to CC's engines. +2025-11-04T21:38:56Z INFO 9072 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: assign_trigger_engine finished after 0.031 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Output has 6 module(s), 6 function(s), 15583 memory location(s), 6 block(s), and 53495 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53495 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=3370 blocks=1 instructions=15845 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2983 blocks=1 instructions=15126 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2233 blocks=1 instructions=4371 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2382 blocks=1 instructions=6891 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2381 blocks=1 instructions=6888 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=2234 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.011 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.013 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=15583 blocks=6 instructions=53511 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: assign_hwdge_engine finished after 0.013 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Output has 6 module(s), 6 function(s), 15583 memory location(s), 6 block(s), and 53511 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53511 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2233 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2382 blocks=1 instructions=6893 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=3370 blocks=1 instructions=15848 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2983 blocks=1 instructions=15129 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2381 blocks=1 instructions=6890 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 64 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 186 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 283 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 125 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 64 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 186 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 282 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 125 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2382 blocks=1 instructions=6893 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2381 blocks=1 instructions=6890 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 15 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 17 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 482 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 24 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 32 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 9 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 10 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 22 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 482 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 24 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 64 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 165 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 140 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 108 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2233 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=2234 blocks=1 instructions=4377 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2983 blocks=1 instructions=15129 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=3370 blocks=1 instructions=15848 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.006 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 64 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 165 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 141 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 108 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.008 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=2234 blocks=1 instructions=4377 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.009 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.019 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53511 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:56Z USER 9072 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:56Z INFO 9072 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=7597 blocks=3 instructions=26393 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=7986 blocks=3 instructions=27118 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 7597 memory location(s), 3 block(s), and 26393 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 7986 memory location(s), 3 block(s), and 27118 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: nc_parallel_pass finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:56Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53511 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2381 blocks=1 instructions=6890 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2983 blocks=1 instructions=15129 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2381 blocks=1 instructions=6890 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2983 blocks=1 instructions=15129 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2382 blocks=1 instructions=6893 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2382 blocks=1 instructions=6893 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2233 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=2234 blocks=1 instructions=4377 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=3370 blocks=1 instructions=15848 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2233 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=2234 blocks=1 instructions=4377 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=3370 blocks=1 instructions=15848 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2233 blocks=1 instructions=4374 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.009 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2234 blocks=1 instructions=4377 Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.022 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 528mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2381 blocks=1 instructions=6890 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.027 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2382 blocks=1 instructions=6893 Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 5259 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 5708 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 5708 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 5265 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 5711 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 5711 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.047 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 531mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=2983 blocks=1 instructions=15129 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.042 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 531mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z USER 9072 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=3370 blocks=1 instructions=15848 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 9325 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 9333 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 9954 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 9954 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 9963 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 9963 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Finished dependency reduction: 20989 removed, new total 2299 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:56Z USER 9072 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.087 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 542mb, ru_maxrss: 542mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2233 memory location(s), 1 block(s), and 4374 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 15321 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 14763 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Finished dependency reduction: 20853 removed, new total 2300 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:56Z USER 9072 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.112 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 545mb, ru_maxrss: 545mb (delta=3mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2234 memory location(s), 1 block(s), and 4377 instruction(s). Max writers: 66 Max Readers: 448 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 16538 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 16538 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 15614 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 15614 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Finished dependency reduction: 37741 removed, new total 2634 +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:56Z USER 9072 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.136 seconds +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 547mb, ru_maxrss: 547mb (delta=5mb) +2025-11-04T21:38:56Z INFO 9072 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2382 memory location(s), 1 block(s), and 6893 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Finished dependency reduction: 37730 removed, new total 2632 +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:56Z USER 9072 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.165 seconds +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 547mb, ru_maxrss: 547mb (delta=5mb) +2025-11-04T21:38:56Z INFO 9072 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2381 memory location(s), 1 block(s), and 6890 instruction(s). Max writers: 66 Max Readers: 496 +2025-11-04T21:38:57Z INFO 9072 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sg02) [DepReduction]: Finished dependency reduction: 83724 removed, new total 4715 +2025-11-04T21:38:57Z INFO 9072 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:57Z USER 9072 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.233 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 549mb, ru_maxrss: 549mb (delta=7mb) +2025-11-04T21:38:57Z INFO 9072 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3370 memory location(s), 1 block(s), and 15848 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sg02) [DepReduction]: Finished dependency reduction: 69964 removed, new total 3879 +2025-11-04T21:38:57Z INFO 9072 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:57Z USER 9072 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.256 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 549mb, ru_maxrss: 549mb (delta=7mb) +2025-11-04T21:38:57Z INFO 9072 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2983 memory location(s), 1 block(s), and 15129 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.320 seconds +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: curr_vmrss: 544mb, ru_maxrss: 549mb (delta=7mb) +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=15583 blocks=6 instructions=53511 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=7986 blocks=3 instructions=27118 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=7597 blocks=3 instructions=26393 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: DMA Descriptor ReUse Enabled. +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 136739 #MatMult-Transposes 19275 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 432 #out: 0 #inp: 432 #symmetric: 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 136863 #MatMult-Transposes 19275 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 432 #out: 0 #inp: 432 #symmetric: 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: bir_linker finished after 0.549 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 852mb, ru_maxrss: 852mb (delta=303mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 406071194, 72.0732% input load, 2.19491% output write, 25.7318% spill/reload +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.006 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 32 │ 9957277696 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 73984 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10502660 │ +│ Load │ Internal │ 161 │ 15204352 │ +│ Save │ Internal │ 108 │ 14680064 │ +│ Save │ Internal -> Output │ 18 │ 4718592 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 194 │ +│ 512 │ 1 │ +│ 1024 │ 16 │ +│ 2048 │ 90 │ +│ 4096 │ 42 │ +│ 1048576 │ 64 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 209 │ 68166148 │ +│ Load │ Input -> Internal │ 2 │ 524288 │ +│ Load │ Internal │ 181 │ 25690112 │ +│ Save │ Internal │ 125 │ 23592960 │ +│ Save │ Internal -> Output │ 8 │ 4194304 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 193 │ +│ 1024 │ 112 │ +│ 2048 │ 42 │ +│ 4096 │ 172 │ +│ 1048576 │ 64 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 2 │ 8388608 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 486 │ 213270540 │ +│ Load │ Internal │ 32 │ 12586118 │ +│ Save │ Internal │ 324 │ 12736000 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 97 │ +│ 4096 │ 433 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 19219 #MatMult-Transposes 6379 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781451308 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 8388608 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: report_stats finished after 0.011 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:57Z INFO 9072 []: find first defs for local +2025-11-04T21:38:57Z INFO 9072 []: find first defs for global +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.042 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: spill space = 470810680 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 470925312 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: bir_linker finished after 0.654 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=303mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 406726069, 72.0359% input load, 2.19138% output write, 25.7727% spill/reload +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.006 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 32 │ 9957277696 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 73984 │ +│ Load │ ExternalInput -> Internal │ 28 │ 10502660 │ +│ Load │ Internal │ 161 │ 15204352 │ +│ Save │ Internal │ 108 │ 14680064 │ +│ Save │ Internal -> Output │ 19 │ 4718594 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 64 │ 1 │ +│ 128 │ 2 │ +│ 256 │ 194 │ +│ 512 │ 1 │ +│ 1024 │ 16 │ +│ 2048 │ 90 │ +│ 4096 │ 42 │ +│ 1048576 │ 64 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal -> ExternalOutput │ 64 │ 2147483648 │ +│ DMACopy │ Internal -> Output │ 1 │ 16777216 │ +│ DMACopy (Spill) │ Internal │ 64 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 209 │ 68166148 │ +│ Load │ Input -> Internal │ 2 │ 524288 │ +│ Load │ Internal │ 181 │ 25690112 │ +│ Save │ Internal │ 125 │ 23592960 │ +│ Save │ Internal -> Output │ 9 │ 4194306 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 193 │ +│ 1024 │ 112 │ +│ 2048 │ 42 │ +│ 4096 │ 172 │ +│ 1048576 │ 64 │ +│ 4194304 │ 3 │ +│ 8388608 │ 2 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 12582912 │ +│ DMACopy │ Internal │ 4 │ 8388608 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 487 │ 213274636 │ +│ Load │ Internal │ 46 │ 12905354 │ +│ Save │ Internal │ 341 │ 12751367 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 3 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 113 │ +│ 2048 │ 1 │ +│ 4096 │ 434 │ +│ 9496 │ 2 │ +│ 4194304 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 19343 #MatMult-Transposes 6379 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781451308 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 8388608 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 8388608 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: report_stats finished after 0.012 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:38:57Z INFO 9072 []: find first defs for local +2025-11-04T21:38:57Z INFO 9072 []: find first defs for global +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.041 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.034 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: spill space = 470810680 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 470925312 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: size = 86 +2025-11-04T21:38:57Z INFO 9072 []: find first defs for local +2025-11-04T21:38:57Z INFO 9072 []: find first defs for global +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 86 Num locations 86 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: lo = 86 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: total = 86 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 58720256 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 93335552 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.026 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: nc_parallel_pass finished after 0.742 seconds +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=303mb) +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=16611 blocks=8 instructions=53595 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:57Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=16611 blocks=8 instructions=53595 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:57Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 53595 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.004 seconds +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z USER 9072 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:57Z INFO 9072 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=16611 blocks=8 instructions=53595 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.023 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.024 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.014 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.013 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27160 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26435 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=8500 blocks=4 instructions=27160 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=8111 blocks=4 instructions=26435 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.003 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.003 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26442 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27167 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=8111 blocks=4 instructions=26442 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=8500 blocks=4 instructions=27167 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 6240/6240 (100% DGE) + power-of-2 partition : 6241/6274 (99.474% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 6242/6275 (99.4741% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 8711/8719 (99.9082% DGE) + power-of-2 partition : 8711/9028 (96.4887% DGE) + > 3 dimensional : 0/8 (0% DGE) + non-integer desc size : 0/0 + total : 8711/9028 (96.4887% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 29 + Transpose : 1792 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 1824/1824 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: lower_dma finished after 0.069 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26442 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=8111 blocks=4 instructions=26442 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z INFO 9072 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 6240/6240 (100% DGE) + power-of-2 partition : 6269/6332 (99.0051% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 6270/6333 (99.0052% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/170 (99.4118% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/170 (99.4118% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 8716/8724 (99.9083% DGE) + power-of-2 partition : 8716/9066 (96.1394% DGE) + > 3 dimensional : 0/8 (0% DGE) + non-integer desc size : 0/0 + total : 8716/9066 (96.1394% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 29 + Transpose : 1792 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 1824/1824 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: lower_dma finished after 0.073 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27167 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=8500 blocks=4 instructions=27167 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: expand_all_engine finished after 0.007 seconds +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26442 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:57Z INFO 9072 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=8111 blocks=4 instructions=26442 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: expand_all_engine finished after 0.008 seconds +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27167 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:57Z USER 9072 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:57Z INFO 9072 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=8500 blocks=4 instructions=27167 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.034 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26442 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=8111 blocks=4 instructions=26442 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.036 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27167 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=8500 blocks=4 instructions=27167 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: expand_inst_late finished after 0.038 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26741 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=8111 blocks=4 instructions=26741 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [SeqInstOpt]: Removing 160 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [SeqInstOpt]: Removing 129 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.006 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: expand_inst_late finished after 0.039 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 26452 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=8111 blocks=4 instructions=26452 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27466 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=8500 blocks=4 instructions=27466 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [SeqInstOpt]: Removing 160 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [SeqInstOpt]: Removing 129 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.005 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 27177 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=8500 blocks=4 instructions=27177 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: lower_sync finished after 0.018 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28370 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=8111 blocks=4 instructions=28370 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: lower_act finished after 0.005 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 624mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=8111 blocks=4 instructions=28385 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: lower_sync finished after 0.019 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 625mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29262 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=8500 blocks=4 instructions=29262 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: lower_act finished after 0.005 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 625mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=8500 blocks=4 instructions=29278 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: lower_dve finished after 0.102 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 639mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=8111 blocks=4 instructions=28385 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: lower_dve finished after 0.103 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 639mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: lower_ap finished after 0.009 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 639mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=8500 blocks=4 instructions=29278 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=8111 blocks=4 instructions=28385 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: lower_ap finished after 0.009 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 639mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=8500 blocks=4 instructions=29278 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:38:58Z INFO 9072 []: find first defs for local reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 []: find first defs for global reg +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z USER 9072 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.078 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:58Z USER 9072 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.080 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: nc_parallel_pass finished after 0.438 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: vnc_remote_addr_map finished after 0.002 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 57663 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 [VncLink]: Found 0 remote updates +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 57663 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=8500 blocks=4 instructions=29278 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=8111 blocks=4 instructions=28385 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.077 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.079 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.083 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:58Z INFO 9072 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.015 seconds +2025-11-04T21:38:58Z INFO 9072 (sg00) [SubgraphForkPass]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 57663 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: subgraph_parallel_pass finished after 0.017 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 644mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=8500 blocks=4 instructions=29278 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=8111 blocks=4 instructions=28385 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89234 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000626575 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: +┌────────────────┬────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼────────────┤ +│ ExternalInput │ 1.89234 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.00062466 │ +└────────────────┴────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 19683 │ +│ LDWEIGHTS │ 19580 │ +│ EVENT_SEMAPHORE │ 2085 │ +│ UNKNOWN(0xd4) │ 1536 │ +│ ACTIVATE │ 1162 │ +│ COPY │ 979 │ +│ CAST │ 942 │ +│ TENSOR_TENSOR │ 923 │ +│ PSEUDO_DMA_TRIGGER │ 495 │ +│ UNKNOWN(0x9b) │ 320 │ +│ UNKNOWN(0x9a) │ 320 │ +│ GATHER │ 291 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ TENSOR_SCALAR_ADDR │ 287 │ +│ MEMSET │ 177 │ +│ UNKNOWN(0xda) │ 169 │ +│ UNKNOWN(0xd3) │ 145 │ +│ TENSOR_REDUCE │ 141 │ +│ UNKNOWN(0x92) │ 136 │ +│ RECIPROCAL │ 131 │ +│ DVE_READ_INDICES │ 128 │ +│ UNKNOWN(0x24) │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_SCALAR │ 69 │ +│ UNKNOWN(0xd8) │ 52 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ LOAD_MASK_SELECT │ 20 │ +│ STREAM_SHUFFLE │ 20 │ +│ ACT_TABLE_LOAD │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ UNKNOWN(0xd9) │ 8 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xe8) │ 6 │ +│ IOTA │ 3 │ +│ UNKNOWN(0xe5) │ 2 │ +│ ALU_OP │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 3209 │ +│ Scalar │ 4320 │ +│ Tensor │ 39586 │ +│ SyncDMA │ 0 │ +│ Vector │ 3070 │ +│ Sync │ 525 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.248 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 142 │ +│ qPoolSpillReload0_defId_0 │ 163840 │ +│ qPoolSpillReload0_defId_1 │ 163840 │ +│ qPoolSpillReload0_defId_2 │ 207 │ +│ qSPIO0 │ 86088 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 8550 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 423327 (0.00630806 GB) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌───────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├───────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2513-0_grp_14_sec_0_mhlo_exponential_6_b0_i0_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2513-0_b3_grp_15_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2513-0_b0_grp_14_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2769-0_grp_14_sec_0_mhlo_exponential_6_b1_i0_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2513-0_grp_12_sec_0_mhlo_exponential_6_b2_i0_sg0001 │ Internal │ bfloat16 │ 16 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ all-reduce.465.2514_sg0001 │ Internal │ bfloat16 │ 27 │ +│ compare.2.1760_sg0001 │ Internal │ int32 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 298 │ +└───────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 19435 │ +│ LDWEIGHTS │ 19332 │ +│ EVENT_SEMAPHORE │ 1918 │ +│ UNKNOWN(0xd4) │ 1529 │ +│ ACTIVATE │ 1155 │ +│ CAST │ 942 │ +│ TENSOR_TENSOR │ 921 │ +│ COPY │ 851 │ +│ PSEUDO_DMA_TRIGGER │ 456 │ +│ UNKNOWN(0x9a) │ 320 │ +│ UNKNOWN(0x9b) │ 320 │ +│ TENSOR_SCALAR_ADDR │ 287 │ +│ UNKNOWN(0xda) │ 169 │ +│ MEMSET │ 163 │ +│ UNKNOWN(0xd3) │ 145 │ +│ UNKNOWN(0x92) │ 136 │ +│ TENSOR_REDUCE │ 136 │ +│ RECIPROCAL │ 129 │ +│ UNKNOWN(0x24) │ 128 │ +│ TENSOR_SCALAR │ 67 │ +│ UNKNOWN(0xd8) │ 52 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ LOAD_MASK_SELECT │ 16 │ +│ STREAM_SHUFFLE │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ ACT_TABLE_LOAD │ 15 │ +│ UNKNOWN(0xd9) │ 8 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xe8) │ 6 │ +│ IOTA │ 3 │ +│ ALU_OP │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2569 │ +│ Scalar │ 4169 │ +│ Tensor │ 39085 │ +│ SyncDMA │ 0 │ +│ Vector │ 2432 │ +│ Sync │ 479 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.036 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.289 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 163840 │ +│ qPoolSpillReload0_defId_1 │ 163840 │ +│ qPoolSpillReload0_defId_2 │ 7 │ +│ qSPIO0 │ 86084 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 8206 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 422577 (0.00629689 GB) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌───────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├───────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2513-0_grp_12_sec_0_mhlo_exponential_6_b3_i0_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2769-0_grp_12_sec_0_mhlo_exponential_6_b1_i0_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2513-0_b2_grp_13_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2769-0_grp_15_sec_0_mhlo_exponential_6_b1_i0_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2769-0_b1_grp_12_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2513-0_grp_13_sec_0_mhlo_exponential_6_b2_i0_sg0001 │ Internal │ bfloat16 │ 16 │ +│ compare.2.1760_sg0001 │ Internal │ int32 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 297 │ +└───────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.035 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.062 seconds +2025-11-04T21:38:58Z USER 9072 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.363 seconds +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 673mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 8500 memory location(s), 4 block(s), and 29278 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.050 seconds +2025-11-04T21:38:58Z USER 9072 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.389 seconds +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 673mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 8111 memory location(s), 4 block(s), and 28385 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: mod_parallel_pass finished after 0.393 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 673mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 1.125KB │ 101.312KB │ +│ CCE │ 1.312MB │ 48.000B │ +│ Transpose │ 0.000B │ 5.000MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.000KB │ 127.250KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.739GB │ +│ Model Code │ 3.095MB │ +│ Model Constants │ 657.012KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 89.008MB │ +│ DMA Ring IO │ 1.329MB │ +│ DMA Ring Spill │ 5.223MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 1.062KB │ 89.656KB │ +│ CCE │ 1.312MB │ 48.000B │ +│ Transpose │ 0.000B │ 5.000MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 15.500KB │ 111.500KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:58Z INFO 9072 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.707GB │ +│ Model Code │ 2.974MB │ +│ Model Constants │ 655.004KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 56.000MB │ +│ DMA Ring IO │ 1.329MB │ +│ DMA Ring Spill │ 5.196MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:58Z INFO 9072 [HBMUsage]: Total estimated HBM usage is: 3.803GB +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: hbm_usage finished after 0.006 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 673mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 57663 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=16611 blocks=8 instructions=57663 Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1688_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1605-1690_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1616-1692_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2015_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2002_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1545-1641_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1556-1643_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1567-1645_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1577-1647_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1799_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1134-1355_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1568_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1688_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1605-1690_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1616-1692_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2015_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_2002_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1545-1641_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1556-1643_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1567-1645_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1577-1647_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1799_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1568_CRSM.npy +2025-11-04T21:38:58Z INFO 9072 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:38:58Z WARNING 9072 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/metrics.json +2025-11-04T21:38:58Z WARNING 9072 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:58Z INFO 9072 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff +2025-11-04T21:38:58Z INFO 9072 [NeffFileWriter]: IR signature: 1ad0472a9e7631754b31a760a7d927aa for neff artifacts +2025-11-04T21:38:58Z USER 9072 [BackendPassManager]: neff_packager finished after 0.137 seconds +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: curr_vmrss: 673mb, ru_maxrss: 852mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9072 [BackendPassManager]: Output has 2 module(s), 8 function(s), 16611 memory location(s), 8 block(s), and 57663 instruction(s). Max writers: 299 Max Readers: 5434 +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.042969 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.042969 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.003906 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.054688 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.006348 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.054688 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.003906 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.027653 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.003933 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.031590 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.003906 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.054688 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.086926 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.438583 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.003906 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.006348 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.003906 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.003906 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.003906 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.086926 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ dot.4 │ bfloat16 │ 1 │ 8.000000 MB │ +│ get_tuple_element.1 │ bfloat16 │ 1 │ 4.000000 MB │ +│ reshape.16 │ bfloat16 │ 1 │ 4.000000 MB │ +│ reshape.24 │ bfloat16 │ 1 │ 4.000000 MB │ +│ reshape.29 │ bfloat16 │ 1 │ 4.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg02, addr_space=local (complete data located at nc00/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1774 │ bfloat16 │ 1 │ 0.000008 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg02, addr_space=local (complete data located at nc01/sg02/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1785 │ bfloat16 │ 3 │ 0.011719 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 8.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 0.500000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:58Z INFO 9072 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:59Z INFO 8698 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:59Z INFO 8698 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:59Z INFO 8698 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e +2025-11-04T21:38:59Z INFO 8698 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:59Z INFO 8698 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk4/neuronxcc-yihckw_e/hlo_netlist.json +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:59Z INFO 8698 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:59Z INFO 8698 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:59Z INFO 8685 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk4/metaneff.pb b/context_encoding_model/_tp0_bk4/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..946fe251ca80a1c8d2844442377b3fe280b6f2a0 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2c9cfa0cd764e2b2f060557a0315ea75ce71a4875299aa863b7564b6f41b711 +size 3644060 diff --git a/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb b/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..8ecbff17a291195b9564c564762cbe8996502390 --- /dev/null +++ b/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c564e37d09483fd3fa5207db2f0d41a54a9993b618c3243e9e641c74a7d8a5c +size 3730846 diff --git a/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff b/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff new file mode 100644 index 0000000000000000000000000000000000000000..0c6ac4143757fd762b2f565262967b22ebee8d1a --- /dev/null +++ b/context_encoding_model/_tp0_bk4/model.MODULE_95ef7ca73cc0a6161be2+96be3c33.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bfd67384e1a0c5645609060b8bfb6fc5cfe3dbbd75b7568508606e623f387d +size 1926144 diff --git a/context_encoding_model/_tp0_bk4/neuron_config.json b/context_encoding_model/_tp0_bk4/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b9517a38831e61607b556ade5a9ef1cd243b632b --- /dev/null +++ b/context_encoding_model/_tp0_bk4/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 2048 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 2048 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/context_encoding_model/_tp0_bk5/command.txt b/context_encoding_model/_tp0_bk5/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e9c6420954188510dc3029c4a747dc267707d63 --- /dev/null +++ b/context_encoding_model/_tp0_bk5/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --output model.MODULE_96a8f4e12dc810958634+b1e26cef.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json b/context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json new file mode 100644 index 0000000000000000000000000000000000000000..70566ab12cc7d689b1375493a105252dc8b1124d --- /dev/null +++ b/context_encoding_model/_tp0_bk5/compile_flags.MODULE_96a8f4e12dc810958634+b1e26cef.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma ", "--lnc=2", "-O1", "--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt"] \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk5/global_metric_store.json b/context_encoding_model/_tp0_bk5/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..fb583e9b5402c4ba5d322f06dae22f51a718cbd0 --- /dev/null +++ b/context_encoding_model/_tp0_bk5/global_metric_store.json @@ -0,0 +1,1177 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 99.12728881835938, + "StaticProfiler::AveragePartitionUtilization": 95.96998596191406, + "StaticProfiler::AveragePeUtilization": 97.68225860595703, + "StaticProfiler::LocalizationEfficiency": 56.908729553222656, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1.0, + "StaticProfiler::AveragePartitionUtilization": 1.0, + "StaticProfiler::AveragePeUtilization": 1.0, + "StaticProfiler::LocalizationEfficiency": 1.0, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1.0 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 0.03893709182739258, + "AffinePredicateResolution": 0.00975942611694336, + "AliasDependencyElimination": 0.00020766258239746094, + "AliasDependencyInduction": 0.014848947525024414, + "AliasDependencyReset": 0.0507814884185791, + "BFComputeCutting": 0.004155397415161133, + "BirCodeGenLoop": 0.384446382522583, + "CCOpFusion": 0.11220550537109375, + "CanonicalizeConv": 1.8999999156221747e-05, + "CanonicalizeDAGForPGTiling": 0.013774633407592773, + "CanonicalizeForTensorizer": 5.0000002374872565e-05, + "CanonicalizeIR": 0.002764463424682617, + "Canonicalizer": 0.0008950000046752393, + "CoalesceCCOp": 0.01839923858642578, + "CommuteConcat": 0.0019075870513916016, + "DMALocalityOpt": 0.00996088981628418, + "DMAProfiler": 0.02422189712524414, + "DMATilingProfiler": 0.007188081741333008, + "DataLocalityOpt": 0.15634822845458984, + "DataStreaming": 0.03180813789367676, + "DeConcat": 0.0020532608032226563, + "DeadCodeElimination": 0.002146482467651367, + "DeadStoreElimination": 0.024139404296875, + "DelinearIndices": 0.013254880905151367, + "Delinearization": 0.007935047149658203, + "DelinearizeSPMD": 0.023029565811157227, + "DoNothing": 0.0005247592926025391, + "DramToDramTranspose": 0.012213945388793945, + "DumpGraphAndMetadata": 0.03455543518066406, + "EliminateDivs": 0.01893448829650879, + "ExpandBatchNorm": 0.007169485092163086, + "ExpandISAMacro": 0.019716739654541016, + "FactorizeBlkDims": 0.0747368335723877, + "FactorizeThreadAxesInFreeDims": 0.0075495243072509766, + "FlattenMacroLoop": 0.007609844207763672, + "GenericAccessSimplifier": 0.0013933181762695313, + "HoistCompute": 4.999999873689376e-06, + "IdentifyCrossPassTensors": 3.899999865097925e-05, + "InferInitValue": 0.10064125061035156, + "InferIntrinsicOnCC": 0.026311159133911133, + "InferNeuronTensor": 0.05008339881896973, + "InferNonlocalTensors": 0.05733203887939453, + "InferPSumTensor": 0.1221306324005127, + "InferShardAxis": 0.6304898262023926, + "InferSharedMemLoc": 0.0429539680480957, + "InlineNativeKernels": 0.00394749641418457, + "InsertCoreBarrier": 0.01845526695251465, + "InsertIOTransposes": 0.04183030128479004, + "InsertImplicitShardAxisBeforeISel": 0.01711416244506836, + "InsertLocalTransposes": 0.0077512264251708984, + "InsertOffloadedTransposes": 0.010181665420532227, + "LICM": 0.005186319351196289, + "LateLegalizeInst": 0.04364776611328125, + "LateLegalizePostSplit": 0.03845643997192383, + "LateLowerReshapeOp": 0.0019919872283935547, + "LateLowerTensorOp": 0.0022301673889160156, + "LateNeuronInstComb": 0.04980278015136719, + "LayoutPreprocessing": 0.05747699737548828, + "LayoutPreprocessingAndAnalysis": 0.09093403816223145, + "LayoutRequirementAnalysis": 0.010792970657348633, + "LegalizeCCOpLayout": 0.0032892227172851563, + "LegalizeOpLevelAlias": 0.0013661384582519531, + "LegalizePartitionReduce": 0.006167411804199219, + "LegalizeSundaAccess": 0.10145425796508789, + "LegalizeSundaMacro": 0.051756858825683594, + "LegalizeType": 0.07339167594909668, + "LocalLayoutOpt": 0.021276235580444336, + "LoopFusion": 0.006464719772338867, + "LoopSplitting": 0.0007054805755615234, + "LowerBroadcast": 0.01979851722717285, + "LowerCCOpBlockAxis": 0.008892297744750977, + "LowerComplexBroadcast": 0.0035398006439208984, + "LowerIntrinsics": 0.05094194412231445, + "LowerShardAxis": 0.04483389854431152, + "LowerTensorOp": 0.025528907775878906, + "LowerToSendRecv": 0.04537153244018555, + "LowerTranspose": 0.040845394134521484, + "MacroGeneration": 0.08503556251525879, + "MaskPropagation": 0.007714748382568359, + "MemcastMotion": 1.9999999494757503e-05, + "MemcpyElimination": 0.062020301818847656, + "MutateDataType": 0.0020122528076171875, + "NeuronAliasDependencyInduction": 0.0006520748138427734, + "NeuronAliasDependencyReset": 0.10503625869750977, + "NeuronInstComb": 0.057951927185058594, + "NeuronLICM": 0.05489492416381836, + "NeuronLoopFusion": 0.05422854423522949, + "NeuronLoopInterchange": 0.0029349327087402344, + "NeuronSimplifier": 0.026484966278076172, + "NeuronSimplifyPredicates": 0.04440903663635254, + "NeuronValueNumbering": 0.02174234390258789, + "OptimizeAliasedCopyChain": 0.0018880367279052734, + "OptimizeNKIKernels": 4.115047454833984, + "PAGLayoutOpt": 0.11529350280761719, + "PComputeCutting": 0.010918140411376953, + "PGLayoutTilingPipeline": 1.6512439250946045, + "PGTiling": 0.2841973304748535, + "PadElimination": 0.0008590221405029297, + "ParAxesAnnotation": 0.07899093627929688, + "PartialLoopFusion": 0.03534102439880371, + "PartialSimdFusion": 0.021408557891845703, + "PenguinizeFunctions": 4.70000013592653e-05, + "PerfectLoopNest": 0.008621454238891602, + "PruneFunctions": 4.70000013592653e-05, + "RecognizeOpIdiom": 0.010253190994262695, + "Recompute": 0.0005791187286376953, + "RelaxPredicates": 0.013797521591186523, + "Rematerialization": 0.0054569244384765625, + "RemoveOptimizationBarriers": 4.5000000682193786e-05, + "RemoveShardedPartitionAxes": 0.03261446952819824, + "ReshapeWeights": 0.001524209976196289, + "ResolveAccessConflict": 0.019870281219482422, + "ResolveComplicatePredicates": 0.0053920745849609375, + "RewriteReplicationMatmul": 0.0025107860565185547, + "RewriteWeights": 0.009802579879760742, + "SFKVectorizer": 0.3575756549835205, + "ScatterMotion": 3.899999865097925e-05, + "ShardingPropagationAnalysis": 0.10757136344909668, + "SimpleAllReduceTiling": 0.015942096710205078, + "Simplifier": 0.005366325378417969, + "SimplifyMacroPredicates": 0.016243934631347656, + "SimplifyNeuronTensor": 0.16969990730285645, + "SimplifySlice": 0.002231597900390625, + "SimplifyTensor": 0.017529726028442383, + "SpillPSum": 0.20494413375854492, + "SplitAPUnionSets": 0.15779972076416016, + "SplitAccGrp": 0.005539894104003906, + "StaticProfiler": 0.046514272689819336, + "StaticTransposeLocalTensor": 0.008464574813842773, + "SundaISel": 0.07130837440490723, + "TCTransform": 0.002462148666381836, + "TensorInitialization": 0.011480093002319336, + "TensorOpSimplifier": 0.008947134017944336, + "TensorOpTransform": 0.06947088241577148, + "TensorizerLegalizationPass": 5.699999746866524e-05, + "TileCCOps": 0.012774467468261719, + "TilingProfiler": 0.014863967895507813, + "TransformConvOp": 0.006424665451049805, + "TritiumFusion": 0.11082077026367188, + "ValueNumbering": 0.0049648284912109375, + "VectorizeDMA": 0.004624843597412109, + "VectorizeMatMult": 0.028928518295288086, + "VerifySupportedOps": 3.899999865097925e-05, + "WeightCoalescing": 0.013041973114013672, + "ZeroSizeTensorElimination": 0.00021529197692871094, + "algsimp": 0.001961000030860305, + "batchnorm_expander": 3.5000000934815034e-05, + "boundary-marker-removal": 1.1000000085914508e-05, + "call-inliner": 0.0003279999946244061, + "canonicalize-boundary-marker": 1.3999999282532372e-05, + "collective-stream-id-checker": 9.40000027185306e-05, + "comparison-expander": 0.000506000011228025, + "computation-deduplicator": 5.499999679159373e-05, + "config-lowering": 0.00011899999663000926, + "constant-statistics": 0.0004400000034365803, + "constant_folding": 0.00030700000934302807, + "cse": 3.5000000934815034e-05, + "dce": 7.700000423938036e-05, + "dot_decomposer": 0.0009110000100918114, + "dynamic-slice-transpose": 1.1000000085914508e-05, + "eliminate-redundant-compare": 0.00028899998869746923, + "emit-offloaded-dropout": 5.6000000768108293e-05, + "flatten-call-graph": 0.0006600000197067857, + "fuse-send-recv": 5.5999997130129486e-05, + "hilo-conditional-to-select": 1.3999999282532372e-05, + "hilo::LegalizeAlias": 1.2000000424450263e-05, + "hilo::NeuronInstCombine": 0.00012000000424450263, + "hilo::NeuronOpFusion": 1.4999999621068127e-05, + "hilo::ReplaceTokenTypeWithU8Pass": 3.599999763537198e-05, + "hilo::ScheduleFusion": 4.999999873689376e-06, + "hilo::SixtyFourHack": 9.899999713525176e-05, + "hilo::VerifyAliasing": 6.000000212225132e-06, + "hlo-mac-count": 0.012987000867724419, + "instruction-histogram": 0.0007619999814778566, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001361000002361834, + "io-statistics": 5.6000000768108293e-05, + "legalize-ccops-for-tensorizer": 3.000000106112566e-06, + "legalize-compare": 1.1000000085914508e-05, + "lower-argminmax-custom-call": 9.999999747378752e-06, + "map-inline": 0.0008399999933317304, + "metadata-naming": 4.400000034365803e-05, + "mlir::detail::OpToOpPassAdaptor": 5.7999997807201e-05, + "mlir::hlo::MhloToPyPenguin": 0.010812999680638313, + "mlir::mhlo::LowerComplexExtraPass": 0.0003440000000409782, + "mlir::mhlo::LowerComplexPass": 0.0004799999878741801, + "native-to-custom-softmax": 0.00035600000410340726, + "native-to-custom-softmax-dx": 0.0006880000000819564, + "neuron-hlo-verifier": 0.01168300025165081, + "operand_upcaster": 5.5999997130129486e-05, + "opt-barrier-removal": 0.0003150000120513141, + "post-par-pipe-begin": 1.4000000192027073e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0015290000010281801, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.09849400073289871, + "replace-minimum-constant": 0.0004569999873638153, + "reshape-mover": 0.0001030000057653524, + "simplify-concat": 0.00010699999984353781, + "simplify-while-loops": 7.700000423938036e-05, + "transform-variadic-reduce": 5.8999998145736754e-05, + "tuple-simplifier": 0.00025900002219714224, + "unpack-nested-aws-ntwsr": 0.00023599999258294702, + "unroll-while-loop": 1.2000000424450263e-05, + "zero_sized_hlo_elimination": 0.000783999974373728 + }, + "hilo": { + "ConstantSize": 7348863.0, + "HloInputCount": 371.0, + "HloMacCount": 240674799616.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910944768.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 1088551040.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0, + "StaticProfiler::AifUb": 538.6357421875, + "StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875, + "StaticProfiler::AverageDmaLength": 2517.368896484375, + "StaticProfiler::DDRTransferBytes": 672177216.0, + "StaticProfiler::InternalTransferBytes": 407820064.0, + "StaticProfiler::LoadExpanded": 189029.0, + "StaticProfiler::StoreExpanded": 13673.0, + "StaticProfiler::TotalDMAExpanded": 202702.0, + "StaticProfiler::TotalDynamicInstancesCount": 37700.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 18720.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 11041.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 18.0, + "TilingProfiler::SimdInstructionsAfterTiling": 604.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "all": { + "compiletime": { + "algsimp": 0.0018090000376105309, + "call-inliner": 0.0003000000142492354, + "collective-stream-id-checker": 8.499999967170879e-05, + "comparison-expander": 0.0004920000210404396, + "constant-statistics": 0.0004400000034365803, + "constant_folding": 0.00028300000121816993, + "dce": 7.400000322377309e-05, + "dot_decomposer": 0.0009110000100918114, + "eliminate-redundant-compare": 0.0002789999998640269, + "flatten-call-graph": 0.0006380000268109143, + "hlo-mac-count": 0.007658000104129314, + "instruction-histogram": 0.0007619999814778566, + "io-con-pipe-begin": 7.000000096013537e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001361000002361834, + "io-statistics": 5.6000000768108293e-05, + "map-inline": 0.0008089999901130795, + "native-to-custom-softmax": 0.000307999987853691, + "native-to-custom-softmax-dx": 0.0004140000091865659, + "neuron-hlo-verifier": 0.010607999749481678, + "opt-barrier-removal": 0.0003150000120513141, + "pre-par-pipe-begin": 9.999999974752427e-07, + "pre-par-pipe-end": 0.0, + "pre-partition-simplification": 0.09849400073289871, + "replace-minimum-constant": 0.00043899999582208693, + "reshape-mover": 9.500000305706635e-05, + "simplify-while-loops": 7.100000220816582e-05, + "tuple-simplifier": 0.0002460000105202198, + "unpack-nested-aws-ntwsr": 0.00022600000374950469, + "unroll-while-loop": 1.2000000424450263e-05, + "zero_sized_hlo_elimination": 0.000783999974373728 + } + }, + "attention_isa_kernel": { + "compiletime": { + "CoalesceCCOp": 0.00021338462829589844, + "DMALocalityOpt": 0.0002186298370361328, + "DMAProfiler": 0.00027632713317871094, + "DataStreaming": 0.00021409988403320313, + "DoNothing": 0.0033321380615234375, + "ExpandISAMacro": 0.00029921531677246094, + "FactorizeBlkDims": 0.000396728515625, + "InferPSumTensor": 0.0006210803985595703, + "InferSharedMemLoc": 0.0006666183471679688, + "InsertCoreBarrier": 0.00035572052001953125, + "LateLegalizeInst": 0.00023174285888671875, + "LateNeuronInstComb": 0.000492095947265625, + "LegalizeSundaAccess": 0.0002181529998779297, + "LegalizeType": 0.0002846717834472656, + "LowerBroadcast": 0.00025916099548339844, + "LowerIntrinsics": 0.00029730796813964844, + "LowerTranspose": 0.0002589225769042969, + "NeuronInstComb": 0.000469207763671875, + "NeuronLICM": 0.00020599365234375, + "NeuronSimplifyPredicates": 0.0002067089080810547, + "NeuronValueNumbering": 0.0002777576446533203, + "SFKVectorizer": 0.0018928050994873047, + "SimpleAllReduceTiling": 0.00020241737365722656, + "SimplifyNeuronTensor": 0.0006334781646728516, + "SpillPSum": 0.0007383823394775391, + "WeightCoalescing": 0.00025081634521484375 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0003447532653808594, + "DMALocalityOpt": 0.0003294944763183594, + "DMAProfiler": 0.0012810230255126953, + "DataStreaming": 0.0005331039428710938, + "DoNothing": 0.00017762184143066406, + "ExpandISAMacro": 0.0009202957153320313, + "FactorizeBlkDims": 0.0006163120269775391, + "InferPSumTensor": 0.0011057853698730469, + "InferSharedMemLoc": 0.0004899501800537109, + "InsertCoreBarrier": 0.0004894733428955078, + "LateLegalizeInst": 0.0006704330444335938, + "LateNeuronInstComb": 0.0013632774353027344, + "LegalizeSundaAccess": 0.0025315284729003906, + "LegalizeType": 0.00039649009704589844, + "LowerBroadcast": 0.0004820823669433594, + "LowerIntrinsics": 0.0004119873046875, + "LowerTranspose": 0.0004839897155761719, + "NeuronInstComb": 0.0013201236724853516, + "NeuronLICM": 0.0006861686706542969, + "NeuronSimplifyPredicates": 0.011016607284545898, + "NeuronValueNumbering": 0.0007073879241943359, + "SFKVectorizer": 0.012517213821411133, + "SimpleAllReduceTiling": 0.0003895759582519531, + "SimplifyNeuronTensor": 0.0022177696228027344, + "SpillPSum": 0.0009493827819824219, + "WeightCoalescing": 0.00035071372985839844 + } + }, + "sg00": { + "compiletime": { + "CanonicalizeConv": 1.2999999853491317e-05, + "CanonicalizeForTensorizer": 1.8000000636675395e-05, + "Canonicalizer": 0.00029600001289509237, + "HoistCompute": 3.000000106112566e-06, + "IdentifyCrossPassTensors": 1.4000000192027073e-05, + "MemcastMotion": 7.999999979801942e-06, + "PenguinizeFunctions": 1.8000000636675395e-05, + "PruneFunctions": 3.000000106112566e-06, + "RemoveOptimizationBarriers": 1.700000029813964e-05, + "ScatterMotion": 1.700000029813964e-05, + "TensorizerLegalizationPass": 2.9000000722589903e-05, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 7.000000096013537e-06, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 3.999999989900971e-06, + "computation-deduplicator": 1.5999999959603883e-05, + "config-lowering": 4.3000000005122274e-05, + "constant_folding": 7.000000096013537e-06, + "cse": 1.1000000085914508e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 3.199999991920777e-05, + "flatten-call-graph": 7.000000096013537e-06, + "fuse-send-recv": 1.8000000636675395e-05, + "hilo-conditional-to-select": 3.999999989900971e-06, + "hilo::LegalizeAlias": 4.999999873689376e-06, + "hilo::NeuronInstCombine": 4.8999998398358e-05, + "hilo::NeuronOpFusion": 4.999999873689376e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.4000000192027073e-05, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 1.5999999959603883e-05, + "hilo::VerifyAliasing": 3.000000106112566e-06, + "hlo-mac-count": 8.299999899463728e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.2999999853491317e-05, + "mlir::detail::OpToOpPassAdaptor": 1.9999999494757503e-05, + "mlir::hlo::MhloToPyPenguin": 0.0010160000529140234, + "mlir::mhlo::LowerComplexExtraPass": 0.00013899999612476677, + "mlir::mhlo::LowerComplexPass": 0.0002699999895412475, + "native-to-custom-softmax": 3.7999998312443495e-05, + "native-to-custom-softmax-dx": 0.00024399999529123306, + "neuron-hlo-verifier": 0.0003870000073220581, + "operand_upcaster": 1.700000029813964e-05, + "post-par-pipe-begin": 1.2000000424450263e-05, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005039999959990382, + "replace-minimum-constant": 4.999999873689376e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 3.300000025774352e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 183.30274963378906, + "ConstantSize": 7348863.0, + "HloInputCount": 371.0, + "HloMacCount": 42949672960.0, + "HloOutputCount": 57.0, + "IfmapSize": 3910944768.0, + "OfmapSize": 1879048192.0, + "OutputsReadFromCount": 0.0, + "PassthroughTensorsCount": 0.0, + "RedundantOutputCount": 0.0, + "Traffic": 468620064.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 0.04064464569091797, + "AffinePredicateResolution": 0.0019383430480957031, + "AliasDependencyElimination": 0.00022459030151367188, + "AliasDependencyInduction": 0.019460439682006836, + "AliasDependencyReset": 0.04814887046813965, + "BFComputeCutting": 0.0055506229400634766, + "BirCodeGenLoop": 0.13215899467468262, + "CCOpFusion": 0.11969184875488281, + "CanonicalizeDAGForPGTiling": 0.0033049583435058594, + "CanonicalizeIR": 0.0060040950775146484, + "CoalesceCCOp": 0.0054624080657958984, + "CommuteConcat": 0.002767324447631836, + "DMALocalityOpt": 0.0027179718017578125, + "DMAProfiler": 0.01582622528076172, + "DMATilingProfiler": 0.008585929870605469, + "DataLocalityOpt": 0.2429823875427246, + "DataStreaming": 0.013686180114746094, + "DeConcat": 0.0028448104858398438, + "DeadCodeElimination": 0.00874471664428711, + "DeadStoreElimination": 0.07823586463928223, + "DelinearIndices": 0.01836085319519043, + "Delinearization": 0.009904146194458008, + "DelinearizeSPMD": 0.03007340431213379, + "DoNothing": 9.870529174804688e-05, + "DramToDramTranspose": 0.014807701110839844, + "DumpGraphAndMetadata": 0.00868082046508789, + "EliminateDivs": 0.005564212799072266, + "ExpandBatchNorm": 0.0029854774475097656, + "ExpandISAMacro": 0.006433963775634766, + "FactorizeBlkDims": 0.06867551803588867, + "FactorizeThreadAxesInFreeDims": 0.008321523666381836, + "FlattenMacroLoop": 0.006778717041015625, + "GenericAccessSimplifier": 0.0014896392822265625, + "InferInitValue": 0.06406569480895996, + "InferIntrinsicOnCC": 0.022037982940673828, + "InferNeuronTensor": 0.06763529777526855, + "InferNonlocalTensors": 0.22275519371032715, + "InferPSumTensor": 0.15494084358215332, + "InferShardAxis": 0.5209276676177979, + "InferSharedMemLoc": 0.017581939697265625, + "InlineNativeKernels": 0.007895946502685547, + "InsertCoreBarrier": 0.014360427856445313, + "InsertIOTransposes": 0.026629209518432617, + "InsertImplicitShardAxisBeforeISel": 0.018111467361450195, + "InsertLocalTransposes": 0.02471637725830078, + "InsertOffloadedTransposes": 0.018056154251098633, + "LICM": 0.006089210510253906, + "LateLegalizeInst": 0.020943164825439453, + "LateLegalizePostSplit": 0.01616668701171875, + "LateLowerReshapeOp": 0.004019498825073242, + "LateLowerTensorOp": 0.014237642288208008, + "LateNeuronInstComb": 0.02029895782470703, + "LayoutPreprocessing": 0.09618091583251953, + "LayoutPreprocessingAndAnalysis": 0.1460561752319336, + "LayoutRequirementAnalysis": 0.01375579833984375, + "LegalizeCCOpLayout": 0.004752159118652344, + "LegalizeOpLevelAlias": 0.001943826675415039, + "LegalizePartitionReduce": 0.002205371856689453, + "LegalizeSundaAccess": 0.08727788925170898, + "LegalizeSundaMacro": 0.017870187759399414, + "LegalizeType": 0.01916980743408203, + "LocalLayoutOpt": 0.049512386322021484, + "LoopFusion": 0.012260913848876953, + "LoopSplitting": 0.0006864070892333984, + "LowerBroadcast": 0.006807088851928711, + "LowerCCOpBlockAxis": 0.007787466049194336, + "LowerComplexBroadcast": 0.004546642303466797, + "LowerIntrinsics": 0.04405355453491211, + "LowerShardAxis": 0.033060312271118164, + "LowerTensorOp": 0.026821613311767578, + "LowerToSendRecv": 0.011995553970336914, + "LowerTranspose": 0.02594161033630371, + "MacroGeneration": 0.11522269248962402, + "MaskPropagation": 0.003435373306274414, + "MemcpyElimination": 0.2497720718383789, + "MutateDataType": 0.0027208328247070313, + "NeuronAliasDependencyInduction": 0.002033233642578125, + "NeuronAliasDependencyReset": 0.07921051979064941, + "NeuronInstComb": 0.018134593963623047, + "NeuronLICM": 0.037050724029541016, + "NeuronLoopFusion": 0.037982940673828125, + "NeuronLoopInterchange": 0.0038917064666748047, + "NeuronSimplifier": 0.022843360900878906, + "NeuronSimplifyPredicates": 0.003104686737060547, + "NeuronValueNumbering": 0.009130239486694336, + "OptimizeAliasedCopyChain": 0.004662990570068359, + "OptimizeNKIKernels": 0.3685793876647949, + "PAGLayoutOpt": 0.6570594310760498, + "PComputeCutting": 0.012747764587402344, + "PGLayoutTilingPipeline": 2.4684011936187744, + "PGTiling": 0.4522573947906494, + "PadElimination": 0.005415916442871094, + "ParAxesAnnotation": 0.5855293273925781, + "PartialLoopFusion": 0.06675910949707031, + "PartialSimdFusion": 0.07990288734436035, + "PerfectLoopNest": 0.004445075988769531, + "RecognizeOpIdiom": 0.02440333366394043, + "Recompute": 0.0006387233734130859, + "RelaxPredicates": 0.0069468021392822266, + "Rematerialization": 0.011609554290771484, + "RemoveShardedPartitionAxes": 0.029452085494995117, + "ReshapeWeights": 0.0011801719665527344, + "ResolveAccessConflict": 0.012258052825927734, + "ResolveComplicatePredicates": 0.0021598339080810547, + "RewriteReplicationMatmul": 0.0023620128631591797, + "RewriteWeights": 0.005594730377197266, + "SFKVectorizer": 0.6774003505706787, + "ShardingPropagationAnalysis": 0.07418251037597656, + "SimpleAllReduceTiling": 0.011443138122558594, + "Simplifier": 0.006997346878051758, + "SimplifyMacroPredicates": 0.010604381561279297, + "SimplifyNeuronTensor": 0.026854515075683594, + "SimplifySlice": 0.0022373199462890625, + "SimplifyTensor": 0.013662576675415039, + "SpillPSum": 0.04489874839782715, + "SplitAPUnionSets": 0.09562921524047852, + "SplitAccGrp": 0.0030364990234375, + "StaticProfiler": 0.02321648597717285, + "StaticTransposeLocalTensor": 0.004773139953613281, + "SundaISel": 0.08316183090209961, + "TCTransform": 0.0036308765411376953, + "TensorInitialization": 0.008217096328735352, + "TensorOpSimplifier": 0.013900995254516602, + "TensorOpTransform": 0.04661202430725098, + "TileCCOps": 0.03966546058654785, + "TilingProfiler": 0.02010059356689453, + "TransformConvOp": 0.00817561149597168, + "TritiumFusion": 0.12114953994750977, + "ValueNumbering": 0.01564812660217285, + "VectorizeDMA": 0.007418394088745117, + "VectorizeMatMult": 0.042043209075927734, + "WeightCoalescing": 0.008504390716552734, + "ZeroSizeTensorElimination": 0.0001614093780517578 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 6983.0, + "StaticProfiler::AifUb": 127.67816925048828, + "StaticProfiler::ArithmeticIntensityTensorizer": 265.79534912109375, + "StaticProfiler::AverageDmaLength": 2094.913818359375, + "StaticProfiler::AverageFractalPeUtilization": 99.83814239501953, + "StaticProfiler::AveragePartitionUtilization": 99.57943725585938, + "StaticProfiler::AveragePeUtilization": 99.35083770751953, + "StaticProfiler::DDRTransferBytes": 237259264.0, + "StaticProfiler::InternalTransferBytes": 225476608.0, + "StaticProfiler::LoadExpanded": 36391.0, + "StaticProfiler::LocalizationEfficiency": 208.176025390625, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 340.6685791015625, + "StaticProfiler::StoreExpanded": 27137.0, + "StaticProfiler::TotalDMAExpanded": 63528.0, + "StaticProfiler::TotalDynamicInstancesCount": 10455.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 10430.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 384.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 3104.0, + "TilingProfiler::NumPfTransposes": 7.0, + "TilingProfiler::NumPfTransposesForIo": 0.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 4.0, + "TilingProfiler::PfTransposeInstructions": 1792.0, + "TilingProfiler::PfTransposeInstructionsForIo": 0.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 512.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1280.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 932.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0001": { + "compiletime": { + "AGOrderingAnalysisPass": 0.09063863754272461, + "AffinePredicateResolution": 0.0031011104583740234, + "AliasDependencyElimination": 0.000255584716796875, + "AliasDependencyInduction": 0.012615680694580078, + "AliasDependencyReset": 0.04242563247680664, + "BFComputeCutting": 0.005561113357543945, + "BirCodeGenLoop": 0.09979081153869629, + "CCOpFusion": 0.1346728801727295, + "CanonicalizeDAGForPGTiling": 0.012668848037719727, + "CanonicalizeIR": 0.005399465560913086, + "CoalesceCCOp": 0.007870197296142578, + "CommuteConcat": 0.002213716506958008, + "DMALocalityOpt": 0.008012056350708008, + "DMAProfiler": 0.017035484313964844, + "DMATilingProfiler": 0.014662027359008789, + "DataLocalityOpt": 0.35089898109436035, + "DataStreaming": 0.0234222412109375, + "DeConcat": 0.00548243522644043, + "DeadCodeElimination": 0.010943174362182617, + "DeadStoreElimination": 0.037809133529663086, + "DelinearIndices": 0.028621196746826172, + "Delinearization": 0.0106201171875, + "DelinearizeSPMD": 0.029047727584838867, + "DoNothing": 0.00011301040649414063, + "DramToDramTranspose": 0.01769733428955078, + "DumpGraphAndMetadata": 0.013274908065795898, + "EliminateDivs": 0.006105184555053711, + "ExpandBatchNorm": 0.0027565956115722656, + "ExpandISAMacro": 0.01057887077331543, + "FactorizeBlkDims": 0.06908917427062988, + "FactorizeThreadAxesInFreeDims": 0.00501704216003418, + "FlattenMacroLoop": 0.01100611686706543, + "GenericAccessSimplifier": 0.0046689510345458984, + "InferInitValue": 0.07929110527038574, + "InferIntrinsicOnCC": 0.03535032272338867, + "InferNeuronTensor": 0.07708048820495605, + "InferNonlocalTensors": 0.09707069396972656, + "InferPSumTensor": 0.0996854305267334, + "InferShardAxis": 0.6792669296264648, + "InferSharedMemLoc": 0.009181737899780273, + "InlineNativeKernels": 0.0036575794219970703, + "InsertCoreBarrier": 0.015471458435058594, + "InsertIOTransposes": 0.04584240913391113, + "InsertImplicitShardAxisBeforeISel": 0.008542537689208984, + "InsertLocalTransposes": 0.029177427291870117, + "InsertOffloadedTransposes": 0.01767134666442871, + "LICM": 0.007311820983886719, + "LateLegalizeInst": 0.021373271942138672, + "LateLegalizePostSplit": 0.013000011444091797, + "LateLowerReshapeOp": 0.002672910690307617, + "LateLowerTensorOp": 0.022157907485961914, + "LateNeuronInstComb": 0.038089752197265625, + "LayoutPreprocessing": 0.0897824764251709, + "LayoutPreprocessingAndAnalysis": 0.140883207321167, + "LayoutRequirementAnalysis": 0.011104106903076172, + "LegalizeCCOpLayout": 0.0038611888885498047, + "LegalizeOpLevelAlias": 0.005839109420776367, + "LegalizePartitionReduce": 0.0055887699127197266, + "LegalizeSundaAccess": 0.053086042404174805, + "LegalizeSundaMacro": 0.020623445510864258, + "LegalizeType": 0.009373188018798828, + "LocalLayoutOpt": 0.07568526268005371, + "LoopFusion": 0.03827691078186035, + "LoopSplitting": 0.0006964206695556641, + "LowerBroadcast": 0.0038139820098876953, + "LowerCCOpBlockAxis": 0.015240907669067383, + "LowerComplexBroadcast": 0.00460052490234375, + "LowerIntrinsics": 0.06653690338134766, + "LowerShardAxis": 0.034250497817993164, + "LowerTensorOp": 0.024506807327270508, + "LowerToSendRecv": 0.00830531120300293, + "LowerTranspose": 0.026538848876953125, + "MacroGeneration": 0.1462860107421875, + "MaskPropagation": 0.004972219467163086, + "MemcpyElimination": 0.17155957221984863, + "MutateDataType": 0.0026092529296875, + "NeuronAliasDependencyInduction": 0.0009496212005615234, + "NeuronAliasDependencyReset": 0.029055118560791016, + "NeuronInstComb": 0.010199785232543945, + "NeuronLICM": 0.02064967155456543, + "NeuronLoopFusion": 0.045073747634887695, + "NeuronLoopInterchange": 0.004991292953491211, + "NeuronSimplifier": 0.04068398475646973, + "NeuronSimplifyPredicates": 0.012614011764526367, + "NeuronValueNumbering": 0.008387327194213867, + "OptimizeAliasedCopyChain": 0.004460334777832031, + "OptimizeNKIKernels": 0.3194434642791748, + "PAGLayoutOpt": 0.48951292037963867, + "PComputeCutting": 0.014848470687866211, + "PGLayoutTilingPipeline": 2.5451276302337646, + "PGTiling": 0.5836856365203857, + "PadElimination": 0.000995635986328125, + "ParAxesAnnotation": 0.40463972091674805, + "PartialLoopFusion": 0.06643557548522949, + "PartialSimdFusion": 0.13411688804626465, + "PerfectLoopNest": 0.0027947425842285156, + "RecognizeOpIdiom": 0.01806020736694336, + "Recompute": 0.0004432201385498047, + "RelaxPredicates": 0.009535789489746094, + "Rematerialization": 0.008739471435546875, + "RemoveShardedPartitionAxes": 0.0267181396484375, + "ReshapeWeights": 0.0024602413177490234, + "ResolveAccessConflict": 0.00865793228149414, + "ResolveComplicatePredicates": 0.007423877716064453, + "RewriteReplicationMatmul": 0.003094196319580078, + "RewriteWeights": 0.008661746978759766, + "SFKVectorizer": 0.5552070140838623, + "ShardingPropagationAnalysis": 0.07864713668823242, + "SimpleAllReduceTiling": 0.009680747985839844, + "Simplifier": 0.010446548461914063, + "SimplifyMacroPredicates": 0.012853145599365234, + "SimplifyNeuronTensor": 0.025235891342163086, + "SimplifySlice": 0.001861572265625, + "SimplifyTensor": 0.017523765563964844, + "SpillPSum": 0.09313821792602539, + "SplitAPUnionSets": 0.07895660400390625, + "SplitAccGrp": 0.0044307708740234375, + "StaticProfiler": 0.014701604843139648, + "StaticTransposeLocalTensor": 0.008467674255371094, + "SundaISel": 0.07091832160949707, + "TCTransform": 0.0018222332000732422, + "TensorInitialization": 0.008383989334106445, + "TensorOpSimplifier": 0.013144254684448242, + "TensorOpTransform": 0.17133593559265137, + "TileCCOps": 0.018372297286987305, + "TilingProfiler": 0.022103309631347656, + "TransformConvOp": 0.00668644905090332, + "TritiumFusion": 0.25888824462890625, + "ValueNumbering": 0.00537419319152832, + "VectorizeDMA": 0.018125534057617188, + "VectorizeMatMult": 0.04329061508178711, + "WeightCoalescing": 0.006384849548339844, + "ZeroSizeTensorElimination": 0.00020265579223632813 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 16532.0, + "StaticProfiler::AifUb": 911.9026489257813, + "StaticProfiler::ArithmeticIntensityTensorizer": 525.61767578125, + "StaticProfiler::AverageDmaLength": 2890.18798828125, + "StaticProfiler::AverageFractalPeUtilization": 100.0, + "StaticProfiler::AveragePartitionUtilization": 99.69086456298828, + "StaticProfiler::AveragePeUtilization": 100.0, + "StaticProfiler::DDRTransferBytes": 440600576.0, + "StaticProfiler::InternalTransferBytes": 226492416.0, + "StaticProfiler::LoadExpanded": 92289.0, + "StaticProfiler::LocalizationEfficiency": 57.63966751098633, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 87.69190979003906, + "StaticProfiler::StoreExpanded": 26625.0, + "StaticProfiler::TotalDMAExpanded": 118914.0, + "StaticProfiler::TotalDynamicInstancesCount": 22001.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 22001.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 256.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 12288.0, + "TilingProfiler::NumPfTransposes": 8.0, + "TilingProfiler::NumPfTransposesForIo": 3.0, + "TilingProfiler::NumPfTransposesForLocal": 3.0, + "TilingProfiler::NumPfTransposesForNonlocal": 2.0, + "TilingProfiler::PfTransposeInstructions": 1984.0, + "TilingProfiler::PfTransposeInstructionsForIo": 576.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 384.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1024.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 0.0, + "TilingProfiler::SimdInstructionsAfterTiling": 1188.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg0002": { + "compiletime": { + "AGOrderingAnalysisPass": 0.03893709182739258, + "AffinePredicateResolution": 0.00975942611694336, + "AliasDependencyElimination": 0.00020766258239746094, + "AliasDependencyInduction": 0.014848947525024414, + "AliasDependencyReset": 0.0507814884185791, + "BFComputeCutting": 0.004155397415161133, + "BirCodeGenLoop": 0.384446382522583, + "CCOpFusion": 0.11220550537109375, + "CanonicalizeDAGForPGTiling": 0.013774633407592773, + "CanonicalizeIR": 0.002764463424682617, + "CoalesceCCOp": 0.003862142562866211, + "CommuteConcat": 0.0019075870513916016, + "DMALocalityOpt": 0.0027344226837158203, + "DMAProfiler": 0.009855031967163086, + "DMATilingProfiler": 0.007188081741333008, + "DataLocalityOpt": 0.15634822845458984, + "DataStreaming": 0.008760213851928711, + "DeConcat": 0.0020532608032226563, + "DeadCodeElimination": 0.002146482467651367, + "DeadStoreElimination": 0.024139404296875, + "DelinearIndices": 0.013254880905151367, + "Delinearization": 0.007935047149658203, + "DelinearizeSPMD": 0.023029565811157227, + "DoNothing": 0.0001049041748046875, + "DramToDramTranspose": 0.012213945388793945, + "DumpGraphAndMetadata": 0.03455543518066406, + "EliminateDivs": 0.01893448829650879, + "ExpandBatchNorm": 0.007169485092163086, + "ExpandISAMacro": 0.007604122161865234, + "FactorizeBlkDims": 0.023853540420532227, + "FactorizeThreadAxesInFreeDims": 0.0075495243072509766, + "FlattenMacroLoop": 0.007609844207763672, + "GenericAccessSimplifier": 0.0013933181762695313, + "InferInitValue": 0.10064125061035156, + "InferIntrinsicOnCC": 0.026311159133911133, + "InferNeuronTensor": 0.05008339881896973, + "InferNonlocalTensors": 0.05733203887939453, + "InferPSumTensor": 0.0887153148651123, + "InferShardAxis": 0.6304898262023926, + "InferSharedMemLoc": 0.03429460525512695, + "InlineNativeKernels": 0.00394749641418457, + "InsertCoreBarrier": 0.009274959564208984, + "InsertIOTransposes": 0.04183030128479004, + "InsertImplicitShardAxisBeforeISel": 0.01711416244506836, + "InsertLocalTransposes": 0.0077512264251708984, + "InsertOffloadedTransposes": 0.010181665420532227, + "LICM": 0.005186319351196289, + "LateLegalizeInst": 0.015667200088500977, + "LateLegalizePostSplit": 0.03845643997192383, + "LateLowerReshapeOp": 0.0019919872283935547, + "LateLowerTensorOp": 0.0022301673889160156, + "LateNeuronInstComb": 0.018993377685546875, + "LayoutPreprocessing": 0.05747699737548828, + "LayoutPreprocessingAndAnalysis": 0.09093403816223145, + "LayoutRequirementAnalysis": 0.010792970657348633, + "LegalizeCCOpLayout": 0.0032892227172851563, + "LegalizeOpLevelAlias": 0.0013661384582519531, + "LegalizePartitionReduce": 0.006167411804199219, + "LegalizeSundaAccess": 0.03937268257141113, + "LegalizeSundaMacro": 0.051756858825683594, + "LegalizeType": 0.023316621780395508, + "LocalLayoutOpt": 0.021276235580444336, + "LoopFusion": 0.006464719772338867, + "LoopSplitting": 0.0007054805755615234, + "LowerBroadcast": 0.011565208435058594, + "LowerCCOpBlockAxis": 0.008892297744750977, + "LowerComplexBroadcast": 0.0035398006439208984, + "LowerIntrinsics": 0.04290151596069336, + "LowerShardAxis": 0.04483389854431152, + "LowerTensorOp": 0.025528907775878906, + "LowerToSendRecv": 0.04537153244018555, + "LowerTranspose": 0.024749279022216797, + "MacroGeneration": 0.08503556251525879, + "MaskPropagation": 0.007714748382568359, + "MemcpyElimination": 0.062020301818847656, + "MutateDataType": 0.0020122528076171875, + "NeuronAliasDependencyInduction": 0.0006520748138427734, + "NeuronAliasDependencyReset": 0.10503625869750977, + "NeuronInstComb": 0.026773691177368164, + "NeuronLICM": 0.03244495391845703, + "NeuronLoopFusion": 0.05422854423522949, + "NeuronLoopInterchange": 0.0029349327087402344, + "NeuronSimplifier": 0.026484966278076172, + "NeuronSimplifyPredicates": 0.02537679672241211, + "NeuronValueNumbering": 0.005478858947753906, + "OptimizeAliasedCopyChain": 0.0018880367279052734, + "OptimizeNKIKernels": 4.115047454833984, + "PAGLayoutOpt": 0.11529350280761719, + "PComputeCutting": 0.010918140411376953, + "PGLayoutTilingPipeline": 1.6512439250946045, + "PGTiling": 0.2841973304748535, + "PadElimination": 0.0008590221405029297, + "ParAxesAnnotation": 0.07899093627929688, + "PartialLoopFusion": 0.03534102439880371, + "PartialSimdFusion": 0.021408557891845703, + "PerfectLoopNest": 0.008621454238891602, + "RecognizeOpIdiom": 0.010253190994262695, + "Recompute": 0.0005791187286376953, + "RelaxPredicates": 0.013797521591186523, + "Rematerialization": 0.0054569244384765625, + "RemoveShardedPartitionAxes": 0.03261446952819824, + "ReshapeWeights": 0.001524209976196289, + "ResolveAccessConflict": 0.019870281219482422, + "ResolveComplicatePredicates": 0.0053920745849609375, + "RewriteReplicationMatmul": 0.0025107860565185547, + "RewriteWeights": 0.009802579879760742, + "SFKVectorizer": 0.240997314453125, + "ShardingPropagationAnalysis": 0.10757136344909668, + "SimpleAllReduceTiling": 0.0035986900329589844, + "Simplifier": 0.005366325378417969, + "SimplifyMacroPredicates": 0.016243934631347656, + "SimplifyNeuronTensor": 0.016655683517456055, + "SimplifySlice": 0.002231597900390625, + "SimplifyTensor": 0.017529726028442383, + "SpillPSum": 0.03337574005126953, + "SplitAPUnionSets": 0.15779972076416016, + "SplitAccGrp": 0.005539894104003906, + "StaticProfiler": 0.046514272689819336, + "StaticTransposeLocalTensor": 0.008464574813842773, + "SundaISel": 0.07130837440490723, + "TCTransform": 0.002462148666381836, + "TensorInitialization": 0.011480093002319336, + "TensorOpSimplifier": 0.008947134017944336, + "TensorOpTransform": 0.06947088241577148, + "TileCCOps": 0.012774467468261719, + "TilingProfiler": 0.014863967895507813, + "TransformConvOp": 0.006424665451049805, + "TritiumFusion": 0.11082077026367188, + "ValueNumbering": 0.0049648284912109375, + "VectorizeDMA": 0.004624843597412109, + "VectorizeMatMult": 0.028928518295288086, + "WeightCoalescing": 0.003192901611328125, + "ZeroSizeTensorElimination": 0.00021529197692871094 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 31232.0, + "StaticProfiler::AifUb": 538.6357421875, + "StaticProfiler::ArithmeticIntensityTensorizer": 306.53076171875, + "StaticProfiler::AverageDmaLength": 2517.368896484375, + "StaticProfiler::AverageFractalPeUtilization": 99.12728881835938, + "StaticProfiler::AveragePartitionUtilization": 95.96998596191406, + "StaticProfiler::AveragePeUtilization": 97.68225860595703, + "StaticProfiler::DDRTransferBytes": 672177216.0, + "StaticProfiler::InternalTransferBytes": 407820064.0, + "StaticProfiler::LoadExpanded": 189029.0, + "StaticProfiler::LocalizationEfficiency": 56.908729553222656, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 63.73067855834961, + "StaticProfiler::StoreExpanded": 13673.0, + "StaticProfiler::TotalDMAExpanded": 202702.0, + "StaticProfiler::TotalDynamicInstancesCount": 37700.0, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 37249.0, + "StaticProfiler::TotalLNCComm": 0.0, + "StaticProfiler::TotalLNCCommTransfer": 0.0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0.0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0.0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0.0, + "TilingProfiler::DmaInstructionsAfterTiling": 0.0, + "TilingProfiler::GenericInstructionsAfterTiling": 4.0, + "TilingProfiler::MatMultInstructionsAfterTiling": 18720.0, + "TilingProfiler::NumPfTransposes": 5.0, + "TilingProfiler::NumPfTransposesForIo": 1.0, + "TilingProfiler::NumPfTransposesForLocal": 1.0, + "TilingProfiler::NumPfTransposesForNonlocal": 3.0, + "TilingProfiler::PfTransposeInstructions": 11041.0, + "TilingProfiler::PfTransposeInstructionsForIo": 9504.0, + "TilingProfiler::PfTransposeInstructionsForLocal": 1.0, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1536.0, + "TilingProfiler::ReduceInstructionsAfterTiling": 18.0, + "TilingProfiler::SimdInstructionsAfterTiling": 604.0, + "TilingProfiler::TotalInstructionsAfterTiling": 0.0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0.0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0.0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0.0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0.0, + "TransformConvOp::conv2d_column_packing": 0.0, + "TransformConvOp::conv2d_column_packing_1": 0.0, + "TransformConvOp::conv2d_column_packing_io10": 0.0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0.0 + } + }, + "sg01": { + "compiletime": { + "CanonicalizeConv": 6.000000212225132e-06, + "CanonicalizeForTensorizer": 1.2000000424450263e-05, + "Canonicalizer": 0.00033099998836405575, + "HoistCompute": 1.9999999949504854e-06, + "IdentifyCrossPassTensors": 1.4000000192027073e-05, + "MemcastMotion": 6.000000212225132e-06, + "PenguinizeFunctions": 1.2000000424450263e-05, + "PruneFunctions": 1.2999999853491317e-05, + "RemoveOptimizationBarriers": 1.8999999156221747e-05, + "ScatterMotion": 1.4000000192027073e-05, + "TensorizerLegalizationPass": 1.700000029813964e-05, + "VerifySupportedOps": 1.4999999621068127e-05, + "algsimp": 5.6000000768108293e-05, + "batchnorm_expander": 1.2000000424450263e-05, + "boundary-marker-removal": 3.999999989900971e-06, + "call-inliner": 1.1000000085914508e-05, + "canonicalize-boundary-marker": 4.999999873689376e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.9999999494757503e-05, + "config-lowering": 3.600000127335079e-05, + "constant_folding": 9.000000318337698e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.999999989900971e-06, + "eliminate-redundant-compare": 3.999999989900971e-06, + "emit-offloaded-dropout": 1.4000000192027073e-05, + "flatten-call-graph": 6.000000212225132e-06, + "fuse-send-recv": 2.2000000171829015e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 3.999999989900971e-06, + "hilo::NeuronInstCombine": 4.70000013592653e-05, + "hilo::NeuronOpFusion": 7.999999979801942e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 1.4999999621068127e-05, + "hilo::ScheduleFusion": 9.999999974752427e-07, + "hilo::SixtyFourHack": 9.999999747378752e-06, + "hilo::VerifyAliasing": 1.9999999949504854e-06, + "hlo-mac-count": 8.800000068731606e-05, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.999999989900971e-06, + "lower-argminmax-custom-call": 3.999999989900971e-06, + "map-inline": 9.999999747378752e-06, + "metadata-naming": 1.700000029813964e-05, + "mlir::detail::OpToOpPassAdaptor": 1.700000029813964e-05, + "mlir::hlo::MhloToPyPenguin": 0.0024689999409019947, + "mlir::mhlo::LowerComplexExtraPass": 0.00012599999899975955, + "mlir::mhlo::LowerComplexPass": 0.0001630000042496249, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.2999999853491317e-05, + "neuron-hlo-verifier": 0.00035600000410340726, + "operand_upcaster": 2.5999999706982635e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0005549999768845737, + "replace-minimum-constant": 6.000000212225132e-06, + "reshape-mover": 3.000000106112566e-06, + "simplify-concat": 4.199999966658652e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 7.000000096013537e-06, + "tuple-simplifier": 4.999999873689376e-06, + "unpack-nested-aws-ntwsr": 3.000000106112566e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 1091.5736083984375, + "HloMacCount": 120259084288.0, + "Traffic": 220340768.0 + } + }, + "sg02": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 1.9999999494757503e-05, + "Canonicalizer": 0.0002680000034160912, + "HoistCompute": 0.0, + "IdentifyCrossPassTensors": 1.1000000085914508e-05, + "MemcastMotion": 6.000000212225132e-06, + "PenguinizeFunctions": 1.700000029813964e-05, + "PruneFunctions": 3.099999958067201e-05, + "RemoveOptimizationBarriers": 9.000000318337698e-06, + "ScatterMotion": 7.999999979801942e-06, + "TensorizerLegalizationPass": 1.1000000085914508e-05, + "VerifySupportedOps": 1.2000000424450263e-05, + "algsimp": 4.8000001697801054e-05, + "batchnorm_expander": 1.1000000085914508e-05, + "boundary-marker-removal": 3.000000106112566e-06, + "call-inliner": 9.999999747378752e-06, + "canonicalize-boundary-marker": 3.999999989900971e-06, + "collective-stream-id-checker": 3.000000106112566e-06, + "comparison-expander": 4.999999873689376e-06, + "computation-deduplicator": 1.8999999156221747e-05, + "config-lowering": 3.9999998989515007e-05, + "constant_folding": 7.999999979801942e-06, + "cse": 1.2000000424450263e-05, + "dce": 9.999999974752427e-07, + "dynamic-slice-transpose": 3.000000106112566e-06, + "eliminate-redundant-compare": 3.000000106112566e-06, + "emit-offloaded-dropout": 9.999999747378752e-06, + "flatten-call-graph": 9.000000318337698e-06, + "fuse-send-recv": 1.5999999959603883e-05, + "hilo-conditional-to-select": 4.999999873689376e-06, + "hilo::LegalizeAlias": 3.000000106112566e-06, + "hilo::NeuronInstCombine": 2.4000000848900527e-05, + "hilo::NeuronOpFusion": 1.9999999949504854e-06, + "hilo::ReplaceTokenTypeWithU8Pass": 7.000000096013537e-06, + "hilo::ScheduleFusion": 1.9999999949504854e-06, + "hilo::SixtyFourHack": 7.300000288523734e-05, + "hilo::VerifyAliasing": 9.999999974752427e-07, + "hlo-mac-count": 0.005158000160008669, + "legalize-ccops-for-tensorizer": 9.999999974752427e-07, + "legalize-compare": 3.000000106112566e-06, + "lower-argminmax-custom-call": 3.000000106112566e-06, + "map-inline": 1.1000000085914508e-05, + "metadata-naming": 1.4000000192027073e-05, + "mlir::detail::OpToOpPassAdaptor": 2.099999983329326e-05, + "mlir::hlo::MhloToPyPenguin": 0.007327999919652939, + "mlir::mhlo::LowerComplexExtraPass": 7.899999764049426e-05, + "mlir::mhlo::LowerComplexPass": 4.70000013592653e-05, + "native-to-custom-softmax": 4.999999873689376e-06, + "native-to-custom-softmax-dx": 1.700000029813964e-05, + "neuron-hlo-verifier": 0.0003319999959785491, + "operand_upcaster": 1.2999999853491317e-05, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.00046999999904073775, + "replace-minimum-constant": 7.000000096013537e-06, + "reshape-mover": 1.9999999949504854e-06, + "simplify-concat": 3.199999991920777e-05, + "simplify-while-loops": 1.9999999949504854e-06, + "transform-variadic-reduce": 4.5000000682193786e-05, + "tuple-simplifier": 3.999999989900971e-06, + "unpack-nested-aws-ntwsr": 3.999999989900971e-06, + "unroll-while-loop": 0.0 + }, + "hilo": { + "ArithmeticIntensity": 387.7274169921875, + "HloMacCount": 77466042368.0, + "Traffic": 399590208.0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.014192342758178711, + "DMALocalityOpt": 0.00689697265625, + "DMAProfiler": 0.01308584213256836, + "DataStreaming": 0.022514820098876953, + "DoNothing": 0.0002422332763671875, + "ExpandISAMacro": 0.01119232177734375, + "FactorizeBlkDims": 0.05026698112487793, + "InferPSumTensor": 0.032309532165527344, + "InferSharedMemLoc": 0.008169412612915039, + "InsertCoreBarrier": 0.008690834045410156, + "LateLegalizeInst": 0.02731013298034668, + "LateNeuronInstComb": 0.029446125030517578, + "LegalizeSundaAccess": 0.05955004692077637, + "LegalizeType": 0.04967856407165527, + "LowerBroadcast": 0.0077512264251708984, + "LowerIntrinsics": 0.007628440856933594, + "LowerTranspose": 0.015612125396728516, + "NeuronInstComb": 0.029858112335205078, + "NeuronLICM": 0.02176380157470703, + "NeuronSimplifyPredicates": 0.008015632629394531, + "NeuronValueNumbering": 0.015556097030639648, + "SFKVectorizer": 0.10406112670898438, + "SimpleAllReduceTiling": 0.01195383071899414, + "SimplifyNeuronTensor": 0.15082645416259766, + "SpillPSum": 0.17061901092529297, + "WeightCoalescing": 0.009498357772827148 + } + } +} \ No newline at end of file diff --git a/context_encoding_model/_tp0_bk5/graph.neff b/context_encoding_model/_tp0_bk5/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..829fed0d85d00a9196fb2d504e59a821afc81b1a --- /dev/null +++ b/context_encoding_model/_tp0_bk5/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56e28f3613a7ada8c1d580c4a0d3979da6436bd82072a724c52018668343c286 +size 3062784 diff --git a/context_encoding_model/_tp0_bk5/log-neuron-cc.txt b/context_encoding_model/_tp0_bk5/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70102fffe94f621cf23b83c4c9a0f219d0996c3 --- /dev/null +++ b/context_encoding_model/_tp0_bk5/log-neuron-cc.txt @@ -0,0 +1,10059 @@ +2025-11-04T21:38:35Z INFO 8743 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma' --lnc=2 -O1 '--internal-hlo2tensorizer-options= --modular-flow-mac-threshold=10 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:35Z INFO 8743 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:35Z INFO 8756 [root]: XLA detected +2025-11-04T21:38:35Z INFO 8756 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:35Z INFO 8756 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5 +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --modular-flow-mac-threshold=10 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: +Pre-Partition Pre-Opt Histogram: +total HLO instructions: 8312 + reshape 1912 23.00% ################################################################ + broadcast 1123 13.51% ##################################### + transpose 1072 12.90% ################################### + convert 945 11.37% ############################### + constant 636 7.65% ##################### + parameter 371 4.46% ############ + slice 347 4.17% ########### + add 284 3.42% ######### + get-tuple-element 259 3.12% ######## + multiply 255 3.07% ######## + dot 198 2.38% ###### + call 174 2.09% ##### + compare 173 2.08% ##### + select 170 2.05% ##### + concatenate 116 1.40% ### + tuple 57 0.69% # + scatter 57 0.69% # + negate 56 0.67% # + all-reduce 56 0.67% # + divide 29 0.35% + gather 6 0.07% + iota 5 0.06% + all-gather 3 0.04% + reduce 3 0.04% + custom-call 2 0.02% + sine 1 0.01% + cosine 1 0.01% + maximum 1 0.01% + + +Pre-Partition Post-Op Histogram: +total HLO instructions: 5437 + reshape 1421 26.14% ################################################################ + transpose 817 15.03% #################################### + convert 720 13.24% ################################ + constant 443 8.15% ################### + parameter 371 6.82% ################ + broadcast 266 4.89% ########### + dot 197 3.62% ######## + custom-call 175 3.22% ####### + multiply 171 3.15% ####### + add 171 3.15% ####### + get-tuple-element 147 2.70% ###### + slice 115 2.12% ##### + concatenate 114 2.10% ##### + compare 59 1.09% ## + select 58 1.07% ## + scatter 57 1.05% ## + negate 56 1.03% ## + all-reduce 56 1.03% ## + gather 6 0.11% + all-gather 3 0.06% + iota 3 0.06% + reduce 3 0.06% + pad 2 0.04% + sine 1 0.02% + divide 1 0.02% + tuple 1 0.02% + maximum 1 0.02% + rng 1 0.02% + cosine 1 0.02% + +Potential split-points stats: #CC 59 #AR 56 #AG 3 #BN 0 nClamp 0 +ModuleSplitter initial partitioning... #parts 59 +ModuleSplitter initial partitioning... Done. + 0 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 3 2 57 58 +New disjoint wave: start 2 len 54 NumReps: 27 macs 3246995275776 +First non-zero-mac/used part from the end is 58 +Not enough zero-mac parts. skip +ModuleSplitter initial partitioning... #parts 29 +ModuleSplitter initial partitioning... Done. +Remat: gather-iota 0 matches, 0 ops rematted +Wrote HLO netlist to hlo_netlist.json +Wrote graph partitions in debug_info_hlo_partitions.json +Processing partition 0 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call dot gather get-tuple-element multiply negate parameter reshape scatter select sine slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 1 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-reduce broadcast compare concatenate constant convert custom-call dot get-tuple-element multiply negate parameter reshape scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass +Processing partition 2 +Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert custom-call divide dot gather get-tuple-element iota maximum multiply pad parameter reduce reshape rng scatter select slice transpose tuple +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: IR signature: 650714024bde78949bbf708ed9aa30ac4fbf0c6a16967a50e4a22fa28bf096b4 for sg0000/HLOToTensorizer +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: IR signature: 22a373e9c3dc227c7d4be94a3c5b3457cc7353a9e293443638cdf2bbefdf2a3d for sg0001/HLOToTensorizer +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: IR signature: b61b3fef8df077ac8a81e892afb350ccf71d6f2b604c7ccac1700cf95bab6aeb for sg0002/HLOToTensorizer +2025-11-04T21:38:35Z INFO 8756 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:35Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:35Z INFO 8756 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:35Z INFO 8756 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:35Z INFO 8756 [job.Frontend.0]: Start model loading +2025-11-04T21:38:35Z INFO 8756 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:36Z INFO 8756 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:38:36Z USER 8756 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:36Z INFO 8756 [Tensorizer]: Max workers: 3 +2025-11-04T21:38:36Z INFO 8924 [Tensorizer]: Building model from Penguin script "penguin.py.000000"... +2025-11-04T21:38:36Z INFO 8929 [Tensorizer]: Building model from Penguin script "penguin.py.000001"... +2025-11-04T21:38:36Z INFO 8925 [Tensorizer]: Building model from Penguin script "penguin.py.000002"... +2025-11-04T21:38:36Z INFO 8924 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:36Z INFO 8924 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:36Z INFO 8929 [Tensorizer]: Allocate SB of shape (128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:36Z INFO 8929 [Tensorizer]: Allocate PSUM of shape (8, 128, 0) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:36Z INFO 8924 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:36Z INFO 8929 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.002 seconds +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.005 seconds +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.004 seconds +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.006 seconds +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:36Z INFO 8929 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.008 seconds +2025-11-04T21:38:36Z INFO 8924 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.027 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.017 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.045 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.025 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.052 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.014 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=2 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.001 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.013 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.007 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.046 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.006 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.009 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.010 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.012 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.019 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.034 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.041 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.016 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.075 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.058 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.005 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.141 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.004 seconds +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:37Z INFO 8924 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.026 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.171 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8925 [sg0002/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.022 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:37Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.003 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.042 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.064 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.069 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.047 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.051 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.160 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.172 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.057 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.048 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.062 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.015 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.065 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Rematerialization]: Rematerialization finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.052 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Rematerialization]: Rematerialization finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.016 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.035 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.038 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.032 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.021 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.231 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.250 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.017 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.070 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.097 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.020 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.061 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.012 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.019 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.042 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.023 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.017 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.014 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.050 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.031 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.011 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.002 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:38Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.014 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.012 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LoopFusion]: LoopFusion finished after 0.038 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.016 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.017 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.011 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.038 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LoopFusion]: LoopFusion finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8929 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TileCCOps]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/TileCCOps]: TileCCOps finished after 0.018 seconds +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.008 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.005 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/TCTransform]: TCTransform finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.002 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:38:39Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.010 seconds +2025-11-04T21:38:39Z INFO 8925 [sg0002/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.078 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.006 seconds +2025-11-04T21:38:39Z INFO 8924 [Tensorizer]: After optimization: 32 statements +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.003 seconds +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:39Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.080 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/TileCCOps]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.007 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.040 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.027 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.067 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.035 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Recompute]: Recompute finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [Tensorizer]: After optimization: 39 statements +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Simplifier]: Simplifier finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: in float32 (512,) %'all_gather.2' = AllGatherOp-162 AllGather_add(float32 (256,) %'add.11', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6459 | hlo_id: 108 | , id = 162 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=2048 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: in uint32 (512,) %'all_gather.3' = AllGatherOp-178 AllGather_add(uint32 (256,) %'add.12', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512,), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.6596 | hlo_id: 117 | , id = 178 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/TileCCOps]: TileCCOps finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.003 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_1 finished after 0.001 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.046 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.076 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.009 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.022 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.024 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LateLowerReshapeOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.002 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/InferIntrinsicOnCC]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.050 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.090 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.012 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.011 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.141 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.026 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.020 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.013 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LocalLayoutOpt]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.021 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.096 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.097 seconds +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.146 seconds +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:40Z INFO 8924 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:40Z INFO 8929 [sg0001/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.010 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.014 seconds +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:38:40Z INFO 8925 [sg0002/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.005 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.057 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.091 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.057 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.223 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.079 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.115 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/Delinearization]: Delinearization finished after 0.008 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.023 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.405 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.490 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/Delinearization]: Delinearization finished after 0.011 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.029 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.108 seconds +2025-11-04T21:38:41Z INFO 8925 [sg0002/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.079 seconds +2025-11-04T21:38:41Z INFO 8929 [sg0001/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.586 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.025 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.657 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.010 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.030 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.074 seconds +2025-11-04T21:38:41Z INFO 8924 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 37 +total number of sharded dags: 13 + +total bytes transferred from input, output, non local tensors: 433165088 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 407968528 +% bytes transferred with 2x bandwidths: 94.18 + +NC0 FLOPs: 208071778 +NC1 FLOPs: 208061536 +% FLOPs sharded: 100.00 + + +Shard dim: 2048, Number of dags: 6 +Matmuls sharded with this dim: +[2,2048(s),2,6,2,128] @ [2,6,2,128,8,2,128] = [2,2048(s),8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[2,2048(s),2,8,128] @ [2,8,128,2,6,2,128] = [2,2048(s),2,6,2,128] Number of occurrences: 2 + + +Shard dim: 256, Number of dags: 5 +Matmuls sharded with this dim: + + +Shard dim: 4096, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[2,8,128] @ [2,8,128,75968(s)] = [75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 33 +total number of sharded dags: 22 + +total bytes transferred from input, output, non local tensors: 188752902 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 119537664 +% bytes transferred with 2x bandwidths: 63.33 + +NC0 FLOPs: 117964803 +NC1 FLOPs: 76021760 +% FLOPs sharded: 69.94 + + +Shard dim: 4, Number of dags: 13 +Matmuls sharded with this dim: +[2,2048,2,8,128] @ [2,8,128,4(s),128] = [2,2048,4(s),128] Number of occurrences: 1 +[2,2048,2,8,128] @ [2,8,128,4(s),2,64] = [2,2048,4(s),2,64] Number of occurrences: 1 + + +Shard dim: 2, Number of dags: 8 +Matmuls sharded with this dim: +[2,2048,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [2,2048,8,2,128] (stationary-streaming swapped) Number of occurrences: 1 +[2,2048,2,8,128] @ [2,8,128,2(s),2,2,2,64] = [2,2048,2(s),2,2,2,64] Number of occurrences: 1 +[2,2048,2,8,128] @ [2,8,128,2(s),6,2,128] = [2,2048,2(s),6,2,128] Number of occurrences: 2 + + +Shard dim: 2048, Number of dags: 1 +Matmuls sharded with this dim: +[2,2048(s),4,2,128] @ [4,2,128,2,2,4,128] = [2,2048(s),2,2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + + +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.033 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.630 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.029 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.014 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.009 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.027 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(27, 'AG80'), (22, 'AG82'), (23, 'AG81')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG80'), (22, 'AG82'), (23, 'AG81')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 600 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG80'), (22, 'AG82'), (23, 'AG81')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 601 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG89'), (26, 'AG86'), (20, 'AG88'), (25, 'AG87')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(27, 'AG80'), (22, 'AG82'), (23, 'AG81')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(3, 'AG98'), (24, 'AG97'), (22, 'AG82'), (23, 'AG81')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.679 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.039 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.011 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 32 +total number of sharded dags: 25 + +total bytes transferred from input, output, non local tensors: 123771910 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 115382272 +% bytes transferred with 2x bandwidths: 93.22 + +NC0 FLOPs: 52297731 +NC1 FLOPs: 52297728 +% FLOPs sharded: 95.82 + + +Shard dim: 4096, Number of dags: 23 +Matmuls sharded with this dim: +[4096(s),2,2,4,128] @ [2,2,4,128,2,2,2,2,64] = [4096(s),2,2,2,2,64] Number of occurrences: 1 +[4096(s),2,2,4,128] @ [2,2,4,128,4,128] = [4096(s),4,128] Number of occurrences: 1 +[4096(s),2,2,4,128] @ [2,2,4,128,4,2,64] = [4096(s),4,2,64] Number of occurrences: 1 +[64] @ [4096(s)] = [64,4096(s)] Number of occurrences: 1 + + +Shard dim: 2048, Number of dags: 1 +Matmuls sharded with this dim: +[2,2048(s),4,2,128] @ [4,2,128,2,2,4,128] = [2,2048(s),2,2,4,128] (stationary-streaming swapped) Number of occurrences: 1 + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + + +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(34, 'AG122'), (29, 'AG124'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(34, 'AG122'), (29, 'AG124'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(34, 'AG122'), (29, 'AG124'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(35, 'AG127'), (14, 'AG129'), (33, 'AG128')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG133'), (9, 'AG130'), (25, 'AG132'), (30, 'AG131')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 659 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(34, 'AG122'), (29, 'AG124'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 660 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG124'), (34, 'AG122'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 661 of IO tensor {'CrossPassTensor': ''}bfloat16 %input77|N|(64, 2) is not sorted, index list (w/ AG ids): [(15, 'AG134'), (10, 'AG135')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(29, 'AG124'), (34, 'AG122'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 663 of IO tensor {'CrossPassTensor': ''}bfloat16 %input75|N|(64, 2) is not sorted, index list (w/ AG ids): [(26, 'AG139'), (20, 'AG140')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 664 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(29, 'AG124'), (34, 'AG122'), (32, 'AG123')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(31, 'AG148'), (23, 'AG150'), (28, 'AG149')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 694 of IO tensor non_local bfloat16 %reshape.68(2, 2, 2, 2, 64, 2, 2048) is not sorted, index list (w/ AG ids): [(11, 'AG142'), (16, 'AG141'), (21, 'AG143'), (7, 'AG126'), (24, 'AG125')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 644 of IO tensor non_local bfloat16 %reshape.73(2, 2, 2, 2, 2048, 128) is not sorted, index list (w/ AG ids): [(12, 'AG146'), (17, 'AG145'), (22, 'AG147'), (7, 'AG126'), (27, 'AG144')] +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.091 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.018 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.085 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PGTiling]: PGTiling finished after 0.284 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.029 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.521 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.042 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.010 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 1.651 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 192: simd128x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 8: reduce512x1x1 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 8: simd1x512 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 8: reduce512x1x1 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 2: indirect_load128x1 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingBottleneck]: 2: simd1x128 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 2, 4) is not sorted, index list (w/ AG ids): [(31, 'AG99'), (27, 'AG102'), (21, 'AG101'), (29, 'AG100')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 2, 4, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG102'), (31, 'AG99'), (21, 'AG101'), (29, 'AG100')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input66|N|(64, 2) is not sorted, index list (w/ AG ids): [(25, 'AG105'), (22, 'AG108')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 2, 4, 4, 2, 64) is not sorted, index list (w/ AG ids): [(27, 'AG102'), (31, 'AG99'), (21, 'AG101'), (29, 'AG100')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input64|N|(64, 2) is not sorted, index list (w/ AG ids): [(25, 'AG105'), (18, 'AG112')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 2, 4, 4, 128) is not sorted, index list (w/ AG ids): [(27, 'AG102'), (31, 'AG99'), (21, 'AG101'), (29, 'AG100')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 4, 2, 4, 128) is not sorted, index list (w/ AG ids): [(32, 'AG118'), (28, 'AG120'), (30, 'AG119')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 631 of IO tensor non_local bfloat16 %all_gather.1(2, 2, 4, 128, 2, 2048) is not sorted, index list (w/ AG ids): [(21, 'AG101'), (27, 'AG102'), (29, 'AG100'), (31, 'AG99'), (1, 'AG104')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 520 of IO tensor {'IntermediateTensor': ''}bfloat16 %intermediate0(2, 2048, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(1, 'AG104'), (27, 'AG102'), (21, 'AG101'), (29, 'AG100'), (31, 'AG99')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 582 of IO tensor non_local bfloat16 %reshape.16(2, 2, 2, 2, 64, 2, 2048) is not sorted, index list (w/ AG ids): [(6, 'AG111'), (12, 'AG110'), (17, 'AG109'), (22, 'AG108'), (25, 'AG105'), (1, 'AG104')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 676 of IO tensor non_local bfloat16 %reshape.24(4, 2, 2, 64, 2, 2048) is not sorted, index list (w/ AG ids): [(7, 'AG113'), (13, 'AG114'), (18, 'AG112'), (25, 'AG105'), (1, 'AG104')] +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 614 of IO tensor non_local bfloat16 %reshape.29(4, 2, 2, 2048, 128) is not sorted, index list (w/ AG ids): [(8, 'AG116'), (14, 'AG117'), (1, 'AG104'), (19, 'AG115')] +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.041 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.005 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.046 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.015 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.050 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.013 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.006 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.012 seconds +2025-11-04T21:38:42Z INFO 8925 [sg0002/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:38:42Z INFO 8924 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.146 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/PGTiling]: PGTiling finished after 0.584 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.046 seconds +2025-11-04T21:38:42Z INFO 8929 [sg0001/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.115 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.452 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.545 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 192: simd128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.156 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x1 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x1 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 192: simd128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 96: dma128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 64: dma128x1024 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 24: dma128x2048 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 8: reduce512x1x1 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 8: simd1x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PostDLOTilingBottleneck]: 8: reduce512x1x1 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 2.468 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: indirect_load128x256 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x256 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x256 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingBottleneck]: 64: simd128x512 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.022 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.014 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.035 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.052 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.065 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.068 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.015 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.017 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.075 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.001 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.020 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.077 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.019 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.003 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.012 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.040 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.035 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.010 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.008 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.243 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 256: dma128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: indirect_load128x256 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: dma128x512 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 64: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.016 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.009 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.044 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.045 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/InferInitValue]: InferInitValue finished after 0.101 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.026 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.026 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.351 seconds +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 3072: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_128x128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 256: dma128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 192: simd128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: simd128x512 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: rmsnorm128x512x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: transpose_128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/PostDLOTilingBottleneck]: 128: generic_store128x128 +2025-11-04T21:38:43Z INFO 8929 [sg0001/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.027 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.018 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.004 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/LICM]: LICM finished after 0.005 seconds +2025-11-04T21:38:43Z INFO 8925 [sg0002/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.025 seconds +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:43Z INFO 8924 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.015 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/SundaISel]: SundaISel finished after 0.071 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.027 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.027 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.021 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.105 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.029 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.064 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.023 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LICM]: LICM finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.019 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.054 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.024 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.083 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.024 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.013 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.079 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.053 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.063 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/InferInitValue]: InferInitValue finished after 0.079 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.027 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.039 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.041 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.027 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.038 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.004 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.024 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.006 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/DeConcat]: DeConcat finished after 0.002 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.018 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LICM]: Running LICM +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.069 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.008 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LICM]: LICM finished after 0.007 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.021 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.021 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.109 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/SundaISel]: SundaISel finished after 0.071 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.001 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.029 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.124 seconds +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:44Z INFO 8924 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.005 seconds +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:44Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.111 seconds +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:44Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.025 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.025 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.010 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.045 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.013 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.018 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.029 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.004 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.035 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.035 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.025 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.012 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.019 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.076 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.080 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SpillPSum]: SpillPSum finished after 0.033 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.043 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.004 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.121 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeType]: LegalizeType finished after 0.023 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.005 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.032 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.083 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.023 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.084 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.051 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.042 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.036 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.069 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.089 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.067 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.039 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.024 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.014 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.026 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.139 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.007 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.011 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.162 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronValueNumbering]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.009 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.025 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.010 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.008 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SimplifyNeuronTensor]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.014 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.017 seconds +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:45Z INFO 8925 [sg0002/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.018 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.045 seconds +2025-11-04T21:38:45Z INFO 8924 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:38:45Z INFO 8929 [sg0001/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/DeConcat]: DeConcat finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.005 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.044 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.019 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DataStreaming]: DataStreaming finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.130 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.037 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.134 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.080 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.074 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.155 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.054 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.013 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.241 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.016 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.087 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 42.688% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'996.1576'[i31_0,4i31_1_0_0+i31_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i31_0,i0.128+512i31_1_0_0+128i31_1_0_1,i2.16,i1.128] # id=1575, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_996 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 13.685% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[6] bfloat16 (2, 2, 2, 6, 2, 2, 128, 2048) %1539[i11_1_0,i11_0,i11_1_1_0,2i10_0_0_1_0+i10_0_0_1_1,i10_0_0_0,c2_1046,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input366'[i10_0_0_0,2i10_0_0_1_0+i10_0_0_1_1,i0.128,c2_1046,i1.2048] # id=1374, src_id=None, , instances=192 # dl = tensor_op_name: _dot.197 | hlo_id: 52 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 13.685% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[6] bfloat16 (2, 2, 2, 6, 2, 2, 128, 2048) %1536[i16_1_0_1081,i16_0_1081,i13_1_1_0,2i12_0_0_1_0+i12_0_0_1_1,i12_0_0_0,c2_1057,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input368'[i12_0_0_0,2i12_0_0_1_0+i12_0_0_1_1,i0.128,c2_1057,i1.2048] # id=1377, src_id=None, , instances=192 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 395.777us (48.000MiB, est bw: 127.172GB/s, 11.093% of tot. time) for bfloat16<128 x 512> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 2, 6, 128, 2, 512) %1538[i16_1_0_1081,i16_0_1081,i15_0_0_0_1,i15_0_0_0_0,c1_1067_2055,c2_1068_2055,i0.128,i3.2,i1.128+128i2.2+256p_1684_2055] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input365'[i15_0_0_0_1+2i15_0_0_0_0,p_1684_2055,c1_1067_2055,i0.128,c2_1068_2055,i3.2,i2.2,i1.128] # id=1383, src_id=None, , instances=384 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 193.732us (300.000KiB, est bw: 1.586GB/s, 5.430% of tot. time) for float32<1 x 128> {'no_delinear': '0'}non_local float32 (1, 2, 37984) %'convert.55'[0,i31_0,i0.128+512i31_1_0_0+128i31_1_0_1] = store float32<1 x 128> TongaSB partitions[2] float32 (2, 297, 1, 128) %'dot.200.1586'[i31_0,4i31_1_0_0+i31_1_0_1,0,i0.128] # id=1584, src_id=None, , instances=600 # dl = tensor_op_name: _dot.200 | hlo_id: 95 | if -i0.128-512i31_1_0_0-128i31_1_0_1+37983 >= 0 and -4i31_1_0_0-i31_1_0_1+296 >= 0 [[];[i0.128]] -> [[];[i0.128]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 2.311% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %'1000.1660'[i11_1_0,i11_0,i11_1_1_0,T_i3_0_2053,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 2, 2, 512, 2048) %'add.9'[i11_0,i11_1_0,i11_1_1_0,i0.128+128T_i3_0_2053,i1.2048] # id=1550, src_id=None, , instances=32 # dl = tensor_op_name: add.9_pftranspose_1000 | hlo_id: 27 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 2.311% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2, 2, 512) %'_reload_1532'[i16_1_0_1081,i16_0_1081,i13_1_1_0,i4_0_1_1535_2054_0,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (4, 2, 2, 2, 128, 2048) %'_spill_1529'[i4_0_1_1535_2054_0,i16_0_1081,i16_1_0_1081,i13_1_1_0,i0.128,i1.512+1024i2.2+512i3.2] # id=1534, src_id=None, , instances=32 # dl = tensor_op_name: _dot.198 | hlo_id: 42 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 2.311% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %'1004.1665'[T_i20_1_0_1012,T_i20_0_1012,T_i20_1_1_0_1012,T_i3_0_2056,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 2, 512, 2048) %'all_reduce.3'[T_i20_0_1012,0,T_i20_1_0_1012,T_i20_1_1_0_1012,i0.128+128T_i3_0_2056,i1.2048] # id=1559, src_id=None, , instances=32 # dl = tensor_op_name: all_reduce.3_pftranspose_1004 | hlo_id: 66 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 1.395% of tot. time) for bfloat16<128 x 2048> DRAM3DBlk partitions[4] bfloat16 (4, 2, 2, 2, 128, 2048) %'_spill_1529'[i2_0_1_1624_2058_0,i11_0,i11_1_0,i11_1_1_0,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %1019[i11_1_0,i11_0,i11_1_1_0,i2_0_1_1624_2058_0,i0.128,i1.2048] # id=1531, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.348 | hlo_id: 34 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Est. DMA time: 49.765us (16.000MiB, est bw: 337.130GB/s, 1.395% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 8, 128, 2048) %'dot.14'[i16_0_1081,0,i16_1_0_1081,4i16_1_1_0_0_1081_1537+i16_1_1_0_1_1081_1537,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2048) %1082[i16_1_0_1081,i16_0_1081,i16_1_1_0_0_1081_1537,i16_1_1_0_1_1081_1537,i0.128,i1.2048] # id=1386, src_id=None, , instances=32 # dl = tensor_op_name: _dot.199 | hlo_id: 63 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.259 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.010 seconds +2025-11-04T21:38:46Z INFO 8925 [sg0002/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.040 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.042 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.027 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.043 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.018 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.016 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.003 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.029 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.066 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.014 seconds +2025-11-04T21:38:46Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.066 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.008 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.027 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.023 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.006 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.007 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.038 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.018 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.020 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.004 seconds +2025-11-04T21:38:46Z INFO 8929 [sg0001/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.062 seconds +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:46Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.019 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SpillPSum]: SpillPSum finished after 0.093 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.025 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.032 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.067 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.024 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.004 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.025 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeType]: LegalizeType finished after 0.009 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.032 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.009 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.051 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.013 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.047 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.261 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.100 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.019 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_2 +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_2 finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.115 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.053 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.677 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.010 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.021 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.006 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.013 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/ExpandISAMacro]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.016 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.005 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.025 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.011 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.008 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.014 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 13.854% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 2, 2, 128, 2048) %1928[i34_0,i34_1_0_0_0,i34_1_0_0_1,i35_0_0,c1_1573,c2_1574,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 2, 2048) %'input67'[i35_0_0,c1_1573,i0.128,c2_1574,i1.2048] # id=1787, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2 | hlo_id: 32 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 100.987us (8.000MiB, est bw: 83.066GB/s, 8.551% of tot. time) for bfloat16<128 x 256> TongaSB partitions[4] bfloat16 (2, 2, 2, 16, 128, 256) %'transpose.1_pftranspose_1487'[T_i2_1_0_1491,T_i2_0_1491,T_i3_0_1491_0,i3_0_1,i0.128,i1.256] = indirect_load bfloat16<128 x 256> {'CrossPassTensor': ''}bfloat16 (151936, 2, 2, 256) %'input60'[i0.128,T_i2_0_1491,T_i2_1_0_1491,i1.256] generic generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[1] int32 (2, 128, 32, 1) %'input0_local_1529'[T_i2_1_0_1491,i0.128,16T_i3_0_1491_0+i3_0_1,0] # id=1741, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=128 # dl = tensor_op_name: _gather.41 | hlo_id: 12 | [[i0.128];[i1.256]] -> [[i0.128];[i1.256]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.982% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2, 2, 4, 128) %'1492.2065'[i0_0_1535,i1_1_0_1535,T_i1,T_i2_2691,i0.128,i4.2,i3.2,i2.4,i1.128] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 2, 4, 512) %'all_gather.1'[i1_1_0_1535,0,T_i1,T_i2_2691,i0.128,i0_0_1535,i3.2+2i4.2,i1.128+128i2.4] # id=1971, src_id=None, , instances=32 # dl = tensor_op_name: all_gather.1_pftranspose_1492 | hlo_id: 15 | [[i0.128];[i1.128, i2.4, i3.2, i4.2]] -> [[i0.128];[i1.128, i2.4, i3.2, i4.2]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.982% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 2, 2, 4, 128, 2, 2, 512) %'custom-call.177.1925'[i34_0,i16_0_1_0_1562_1927,i16_0_0_1562_1927,i16_0_1_1_1562_1927,i0.128,i3.2,i2.2,i1.512] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 2, 4, 128, 2, 4, 512) %'all_gather.1'[i16_0_1_0_1562_1927,0,i16_0_0_1562_1927,i16_0_1_1_1562_1927,i0.128,i34_0,i2.2+2i3.2,i1.512] # id=1782, src_id=None, , instances=32 # dl = tensor_op_name: _custom-call.177 | hlo_id: 24 | [[i0.128];[i1.512, i2.2, i3.2]] -> [[i0.128];[i1.512, i2.2, i3.2]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.982% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %'_reload_1934'[i64_0,i64_1_0_0_0,i64_1_0_0_1,i48_0_1_0_1937,i48_0_0_1937,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %'_spill_1931'[i48_0_1_0_1937,i48_0_0_1937,i64_0,i64_1_0_0_0,i64_1_0_0_1,i0.128,i1.2048] # id=1936, src_id=None, , instances=32 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.982% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %'_reload_1934_reload_1940'[i2_0_1619,i2_1_0_1619_0_0,i2_1_0_1619_0_1,i48_0_1_0_1937_1939,i48_0_0_1937_1939,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM3DBlk partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %'_spill_1931'[i48_0_1_0_1937_1939,i48_0_0_1937_1939,i2_0_1619,i2_1_0_1619_0_0,i2_1_0_1619_0_1,i0.128,i1.2048] # id=1938, src_id=None, , instances=32 # dl = tensor_op_name: _dot.1 | hlo_id: 88 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 6.982% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %1930[i96_1_0_1646,i96_0_1646,i95_0_0_1,i95_0_0_0_2693,c2_1634_0_2692_2693,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input61'[i95_0_0_0_2693,i95_0_0_1,i0.128,i1.2048+2048c2_1634_0_2692_2693] # id=1888, src_id=None, , instances=32 # dl = tensor_op_name: _dot.3 | hlo_id: 147 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 59.748us (16.000MiB, est bw: 280.799GB/s, 5.059% of tot. time) for bfloat16<128 x 1024> {'IntermediateTensor': ''}bfloat16 (2, 16, 128, 2, 2, 512) %'intermediate0'(init=0.0)[i0_0_1535,8i0_1_0_0_1535_0+4i0_1_0_0_1535_1+i0_1_0_1_1535,i0.128,i3.2,i1_1_0_1535,i1.128+128i2.4] = store bfloat16<128 x 1024> TongaSB partitions[3] bfloat16 (2, 2, 2, 128, 2, 8, 4, 128) %'all_gather.1_pftranspose_1492'[i0_0_1535,i1_1_0_1535,i0_1_0_0_1535_0,i0.128,i3.2,4i0_1_0_0_1535_1+i0_1_0_1_1535,i2.4,i1.128] # id=1747, src_id=None, , instances=64 # dl = tensor_op_name: UnnamedModule | hlo_id: 1 | [[i0.128];[i1.128, i2.4, i3.2]] -> [[i0.128];[i1.128, i2.4, i3.2]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 51.172us (4.000MiB, est bw: 81.965GB/s, 4.333% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output2'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[6] int32 (2, 2, 2, 2, 2, 4, 128, 1) %'scatter.6719.2238'[i111_0,i105_0,i105_1,i104_1_0_0_0,i104_1_0_0_1,i104_1_0_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 4, 2, 128) %'transpose.19'[i111_0,i104_1_0_0_0,i104_1_0_0_1,i105_0,i0.128,i104_1_0_1,i105_1,i1.128] # id=1906, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=128 # dl = tensor_op_name: _scatter.6719 | hlo_id: 187 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 51.172us (4.000MiB, est bw: 81.965GB/s, 4.333% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output1'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[6] int32 (2, 2, 2, 2, 2, 4, 128, 1) %'scatter.6667.2242'[i111_0,i112_0,i112_1,i111_1_0_0_0,i111_1_0_0_1,2i111_1_0_1_0+i111_1_0_1_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 512) %'add.2'[i111_0,i112_0,i111_1_0_0_0,i111_1_0_0_1,i111_1_0_1_0,i0.128,i1.128+128i112_1+256i111_1_0_1_1] # id=1912, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=128 # dl = tensor_op_name: _scatter.6667 | hlo_id: 172 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/DataStreaming]: DataStreaming finished after 0.023 seconds +2025-11-04T21:38:47Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.112 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.016 seconds +2025-11-04T21:38:47Z INFO 8924 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.015 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.005 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.009 seconds +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:47Z INFO 8924 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:48Z INFO 8924 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8924 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 101308) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.013 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.369 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2630) %4(init=0.0)[i0.32,i1.2374] = load float32<32 x 2374> float32 (32, 2374) %6[i0.32,i1.2374] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 4.177us (296.750KiB, est bw: 72.741GB/s, 20.220% of tot. time) for float32<32 x 2374> TongaSB partitions[0] float32 (32, 2374) %10[i0.32,i1.2374] = load float32<32 x 2374> float32 (1, 75968) %'inp'[i0.32,i1.2374] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.2374]] -> [[i0.32];[i1.2374]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 9.509% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 9.301% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 7.936% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 7.789% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.160 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.017 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.138 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.138 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.555 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.023 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.021 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.008 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.010 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.015 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 23.485% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 6, 2, 128, 2048) %1719[i_shard_1546,i16_0_1354,i16_1_0_0_1354_0,i16_1_0_0_1354_1,i10_0_0_1,c2_1319,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input69'[i_shard_1546,i10_0_0_1,i0.128,c2_1319,i1.2048] # id=1597, src_id=None, , instances=192 # dl = tensor_op_name: _dot.4 | hlo_id: 40 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 488.243us (96.000MiB, est bw: 206.175GB/s, 23.485% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[6] bfloat16 (2, 2, 2, 2, 6, 2, 128, 2048) %1720[i_shard_1546,i16_0_1354,i16_1_0_0_1354_0,i16_1_0_0_1354_1,i12_0_0_1,c2_1330,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 6, 128, 2, 2048) %'input71'[i_shard_1546,i12_0_0_1,i0.128,c2_1330,i1.2048] # id=1600, src_id=None, , instances=192 # dl = tensor_op_name: _dot.5 | hlo_id: 30 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 7.870% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 4, 128, 2048) %'1272.1861'[i_shard_1546,i16_0_1354,i16_1_0_0_1354_0,i16_1_0_0_1354_1,T_i2_0_2451,i0.128,i1.2048] = load bfloat16<128 x 2048> non_local bfloat16 (2, 4, 512, 2048) %'add.4'[i16_0_1354,2i16_1_0_0_1354_0+i16_1_0_0_1354_1,i0.128+128T_i2_0_2451,i1.2048] # id=1726, src_id=None, , instances=64 # dl = tensor_op_name: add.4_pftranspose_1272 | hlo_id: 15 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 7.870% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 4, 128, 2048) %'1276.1866'[i38_0_0,i37_0,i37_1_0_1,i37_1_0_0,T_i2_0_2453,i0.128,i1.2048] = load bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 4, 512, 2048) %'all_reduce.1'[i37_0,0,i37_1_0_1+2i37_1_0_0,i0.128+128T_i2_0_2453,i1.2048] # id=1737, src_id=None, , instances=64 # dl = tensor_op_name: all_reduce.1_pftranspose_1276 | hlo_id: 54 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 99.919us (12.000MiB, est bw: 125.931GB/s, 4.806% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 2, 2, 6, 128, 2, 512) %'input68_local_1348'[i_shard_1546,c0_1339_0,c0_1339_1,c2_1341,i0.128,i3.2,i1.128+128i2.2+256p_1891] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (4, 2, 2, 128, 6, 2, 2, 128) %'input68'[2c0_1339_0+c0_1339_1,p_1891,i_shard_1546,i0.128,c2_1341,i3.2,i2.2,i1.128] # id=1606, src_id=None, , instances=96 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.128, i2.2, i3.2]] -> [[i0.128];[i1.128, i2.2, i3.2]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 98.230us (32.000MiB, est bw: 341.592GB/s, 4.725% of tot. time) for bfloat16<128 x 2048> DRAM2DBlk partitions[1] bfloat16 (2, 1, 16, 128, 2048) %'dot.7'[i16_0_1354,0,8i16_1_0_0_1354_0+4i16_1_0_0_1354_1+2i16_1_0_1_1354_0_2452+i16_1_0_1_1354_1_2452,i0.128,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 4, 128, 2048) %1583[i_shard_1546,i16_0_1354,i16_1_0_0_1354_0,i16_1_0_0_1354_1,2i16_1_0_1_1354_0_2452+i16_1_0_1_1354_1_2452,i0.128,i1.2048] # id=1611, src_id=None, , instances=64 # dl = tensor_op_name: _dot.6 | hlo_id: 51 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 98.230us (32.000MiB, est bw: 341.592GB/s, 4.725% of tot. time) for bfloat16<128 x 2048> {'IntermediateTensor': ''}bfloat16 (1, 2, 4, 512, 2048) %'intermediate5'(init=0.0)[0,i37_0,i37_1_0_1+2i37_1_0_0,i0.128+128T_i18_1_1_0_1284,i1.2048] = store bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 4, 128, 2048) %'1280.1745'[i38_0_0,i37_0,i37_1_0_1,i37_1_0_0,T_i18_1_1_0_1284,i0.128,i1.2048] # id=1743, src_id=None, , instances=64 # dl = tensor_op_name: intermediate5_pftranspose_1280 | hlo_id: 2 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 82.457us (16.000MiB, est bw: 203.466GB/s, 3.966% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 2048) %1722[i99_1_0_1467,i99_0_1467,i98_0_0_1,i98_0_0_0_2459,c2_1455_0_2458_2459,i0.128,i1.2048] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 2, 128, 4096) %'input72'[i98_0_0_0_2459,i98_0_0_1,i0.128,i1.2048+2048c2_1455_0_2458_2459] # id=1676, src_id=None, , instances=32 # dl = tensor_op_name: _dot.10 | hlo_id: 173 | [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 51.172us (4.000MiB, est bw: 81.965GB/s, 2.461% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output4'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[6] int32 (2, 2, 2, 2, 2, 4, 128, 1) %'scatter.6821.1912'[i115_0,i107_0,i108_1_2461,i107_1_0_0_0_2461,i107_1_0_0_1_2461,i107_1_0_1_2461,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[4] bfloat16 (2, 2, 2, 2, 128, 4, 2, 128) %'transpose.45'[i115_0,i107_0,i107_1_0_0_0_2461,i107_1_0_0_1_2461,i0.128,i107_1_0_1_2461,i108_1_2461,i1.128] # id=1694, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=128 # dl = tensor_op_name: _scatter.6821 | hlo_id: 207 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Est. DMA time: 51.172us (4.000MiB, est bw: 81.965GB/s, 2.461% of tot. time) for bfloat16<128 x 128> bfloat16 (8, 4, 4096, 128) %'output3'[i0.128,i1.128] generic, generic_dims:[0] generic_addrs: int32<128 x 1> TongaSB partitions[6] int32 (2, 2, 2, 2, 2, 4, 128, 1) %'scatter.6769.1916'[i115_0,i114_0,i115_1_2462,i114_1_0_0_0_2462,i114_1_0_0_1_2462,2i114_1_0_1_2462_0+i114_1_0_1_2462_1,i0.128,0] = indirect_save bfloat16<128 x 128> TongaSB partitions[5] bfloat16 (2, 2, 2, 2, 2, 128, 512) %'add.7'[i115_0,i114_0,i114_1_0_0_0_2462,i114_1_0_0_1_2462,i114_1_0_1_2462_0,i0.128,i1.128+128i115_1_2462+256i114_1_0_1_2462_1] # id=1700, src_id=None, , attrs={'mode': OOBMode.ERROR}, instances=128 # dl = tensor_op_name: _scatter.6769 | hlo_id: 192 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.096 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.017 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DoNothing]: DoNothing finished after 0.003 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.016 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.012 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.050 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.022 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.026 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.051 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.016 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.002 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.000 seconds +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [attention_isa_kernel/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate SB of shape (128, 101308) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/OptimizeNKIKernels]: Allocate PSUM of shape (8, 128, 2048) for CausalAttentionMMSoftmaxMMWithoutSwap +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.030 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.319 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.016 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.008 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.018 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.028 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.029 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.033 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.123 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.123 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.015 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.119 seconds +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8924 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.120 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.079 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.171 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.013 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.008 seconds +2025-11-04T21:38:48Z INFO 8925 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.009 seconds +2025-11-04T21:38:48Z INFO 8929 [sg0001/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.009 seconds +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.050 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.012 seconds +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.022 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.034 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.032 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.032 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.009 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.060 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.133 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.135 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.011 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.173 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.013 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.151 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.007 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.123 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.023 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: BirCodeGen estimate #instances=5518 in sg0000 +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: IR signature: 07e64d8a3154b299d78eeef52b1d2ccc87afab3f8cad7a7620a5759cc0248f69 for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.104 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.027 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: BirCodeGen estimate #instances=11337 in sg0001 +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: IR signature: de7add1807d963ea768768a7a87d67ded3abfcb31ace6a3445949be60c4ceaa8 for nc00/sg0001/TensorizerBIR +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.014 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.012 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.132 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.009 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 272) %4(init=0.0)[i0.32,i1.16] = load float32<32 x 16> float32 (32, 16) %6[i0.32,i1.16] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.014us (2.000KiB, est bw: 1.017GB/s, 12.329% of tot. time) for float32<32 x 16> TongaSB partitions[0] float32 (32, 16) %10[i0.32,i1.16] = load float32<32 x 16> float32 (1, 512) %'inp'[i0.32,i1.16] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.16]] -> [[i0.32];[i1.16]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.965us (4.000KiB, est bw: 2.085GB/s, 12.028% of tot. time) for float32<32 x 32> TongaSB partitions[0] float32 (32, 32) %485[i0.32,i1.32] = load float32<32 x 32> float32 (32, 32) %3[i0.32,i1.32] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.32]] -> [[i0.32];[i1.32]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %316[0,i0.256] = load float32<1 x 256> float32 (32, 8) %304[0,i0.256] # id=306, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.922us (1.000KiB, est bw: 0.533GB/s, 11.765% of tot. time) for uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %319[0,i0.256] = load float32<1 x 256> float32 (32, 8) %307[0,i0.256] # id=309, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for uint32<1 x 256> uint32 (1, 256) %'topk_indices'[0,i0.256] = store uint32<1 x 256> TongaSB partitions[0] uint32 (1, 256) %'global_id_buf'(init=0.0)[0,i0.256] # id=322, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.640us (1.000KiB, est bw: 0.625GB/s, 10.038% of tot. time) for float32<1 x 256> float32 (1, 256) %'topk_values'[0,i0.256] = store float32<1 x 256> TongaSB partitions[0] float32 (1, 256) %'val_buf'(init=0.0)[0,i0.256] # id=324, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[];[i0.256]] -> [[];[i0.256]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %304[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %296[i0.32,i1.8] # id=305, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.609us (1.000KiB, est bw: 0.636GB/s, 9.852% of tot. time) for float32<32 x 8> float32 (32, 8) %307[i0.32,i1.8] = store float32<32 x 8> TongaSB partitions[0] float32 (32, 8) %517[i0.32,i1.8] # id=308, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.32];[i1.8]] -> [[i0.32];[i1.8]] +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.013 seconds +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.008 seconds +2025-11-04T21:38:49Z INFO 8929 [sg0001/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.100 seconds +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: BirCodeGen estimate #instances=11337 in sg0001 +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: IR signature: c1e9e541c6d366437d7098e886e45d7adfc83027a821f481f5c55ed1428de18a for nc01/sg0001/TensorizerBIR +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: Weights total number of bytes: 262146 +2025-11-04T21:38:49Z INFO 8929 [Tensorizer]: Successfully built model. +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: BirCodeGen estimate #instances=5518 in sg0000 +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: IR signature: 5c6260aeea6d2ade5b9f6bd85f587c701a4621dc159195f4b546745d1f7ac2ad for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: Weights total number of bytes: 327938 +2025-11-04T21:38:49Z INFO 8924 [Tensorizer]: Successfully built model. +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:49Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.008 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.010 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.003 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.011 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.002 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.013 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (1, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (1, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if i0.128 == 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 4.115 seconds +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.093 seconds +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.094 seconds +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.047 seconds +2025-11-04T21:38:50Z INFO 8925 [sg0002/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.158 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.038 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.034 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.045 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.112 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.112 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.035 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.045 seconds +2025-11-04T21:38:51Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:52Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:52Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.570 seconds +2025-11-04T21:38:52Z INFO 8925 [Tensorizer]: BirCodeGen estimate #instances=31206 in sg0002 +2025-11-04T21:38:52Z INFO 8925 [Tensorizer]: IR signature: 768e9d874939c96c84d087d7ade9395f5e8990d27cc7e3aee7288ab38aaa6b7e for nc00/sg0002/TensorizerBIR +2025-11-04T21:38:52Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:52Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:53Z INFO 8925 [sg0002/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.384 seconds +2025-11-04T21:38:53Z INFO 8925 [Tensorizer]: BirCodeGen estimate #instances=31206 in sg0002 +2025-11-04T21:38:53Z INFO 8925 [Tensorizer]: IR signature: 10024b7ea18804962ff21eb8a696eedb78890a98f45a9cf2feb7408765a0dabf for nc01/sg0002/TensorizerBIR +2025-11-04T21:38:53Z INFO 8925 [Tensorizer]: Weights total number of bytes: 410376 +2025-11-04T21:38:53Z INFO 8925 [Tensorizer]: Successfully built model. +2025-11-04T21:38:53Z USER 8756 [root/Tensorizer/Tensorizer]: Tensorizer finished after 16.704 seconds +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: End tensorization +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:53Z INFO 8756 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:53Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:53Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:53Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:53Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: BackendDriver has 6 states with 2 core LNC +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: BackendDriver: found partitions within VNC, using VNC + MT modular flow. +2025-11-04T21:38:53Z INFO 8756 [job.BIRLinker.1]: Creating directory nc00/sgLnk/sg00 +2025-11-04T21:38:53Z INFO 8756 [job.BIRLinker.2]: Creating directory nc01/sgLnk/sg00 +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: BackendDriver in_state.num_states 6 with 2 core LNC +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/log-neuron-cc.txt -o walrus_bir.out.json --enable-call-graph --enable-mt-backend --link-subgraphs nc00/sg00,nc01/sg00,nc00/sg01,nc01/sg01,nc00/sg02,nc01/sg02 --link-dir sgLnk/sg00 --vnc-nc-per-sengine 2 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-lower-bound 0.14 --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --enable-internal-partitioner --dge-levels scalar_dynamic_offset,spill_reload,vector_dynamic_offsets,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:53Z INFO 8756 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:53Z INFO 9099 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc01/sg01/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc00/sg02/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc01/sg02/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Loading module from nc00/sg01/bir.json +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Backend driver mtBackend: true numModules: 6 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l" +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Modular flow call graph is enabled +2025-11-04T21:38:53Z INFO 9099 [BackendDriver]: Internal partitioner is enabled +2025-11-04T21:38:53Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:53Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1904 blocks=6 instructions=1574 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z USER 9099 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:53Z USER 9099 (nc00/sg02) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:53Z USER 9099 (nc00/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:53Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:53Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z USER 9099 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:38:53Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z USER 9099 (nc01/sg01) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:38:53Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z USER 9099 (nc01/sg02) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:38:53Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:53Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:53Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 131mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:53Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:53Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:53Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.080 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 181mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.107 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 199mb, ru_maxrss: 219mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z USER 9099 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.161 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 249mb, ru_maxrss: 249mb (delta=30mb) +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z USER 9099 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.193 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 273mb, ru_maxrss: 273mb (delta=54mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z USER 9099 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.256 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 313mb, ru_maxrss: 313mb (delta=94mb) +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.278 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=98mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:54Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.283 seconds +2025-11-04T21:38:54Z INFO 9099 [BackendPassManager]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=98mb) +2025-11-04T21:38:54Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:54Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=1904 blocks=6 instructions=1574 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:54Z USER 9099 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:54Z USER 9099 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:54Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=302 blocks=2 instructions=98 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z USER 9099 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=1180 blocks=2 instructions=1364 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 302 memory location(s), 2 block(s), and 98 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z USER 9099 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:38:54Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 1180 memory location(s), 2 block(s), and 1364 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=422 blocks=2 instructions=112 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z USER 9099 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.004 seconds +2025-11-04T21:38:54Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 422 memory location(s), 2 block(s), and 112 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:54Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:54Z INFO 9099 [BackendPassManager]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:54Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=1904 blocks=6 instructions=1574 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z USER 9099 (nc00/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z USER 9099 (nc00/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z USER 9099 (nc01/sg02) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 211 memory location(s), 1 block(s), and 56 instruction(s). Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 317mb, ru_maxrss: 317mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=211 blocks=1 instructions=56 Max writers: 4 Max Readers: 9 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc00/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 590 memory location(s), 1 block(s), and 682 instruction(s). Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=590 blocks=1 instructions=682 Max writers: 65 Max Readers: 64 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg02) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:54Z USER 9099 (nc01/sg01) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 318mb, ru_maxrss: 318mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 151 memory location(s), 1 block(s), and 49 instruction(s). Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running unroll +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=151 blocks=1 instructions=49 Max writers: 4 Max Readers: 8 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg01) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:54 2025 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Total count: 5515 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Matmult: 2561 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: TensorScalarPtr: 894 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: TensorTensor: 776 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: GenericCopy: 539 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Activation: 268 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: DMACopy: 192 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Load: 150 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Save: 106 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Memset: 11 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: CollectiveCompute: 4 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 192 +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.459 seconds +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 553mb, ru_maxrss: 553mb (delta=236mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Total count: 5518 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Matmult: 2561 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: TensorScalarPtr: 894 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: TensorTensor: 776 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: GenericCopy: 539 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Activation: 268 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: DMACopy: 194 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Load: 150 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Save: 107 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Memset: 11 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: CoreBarrier: 4 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: CollectiveCompute: 4 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Select: 1 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 192 +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.466 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 529mb, ru_maxrss: 553mb (delta=236mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5818 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5818 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5818 memory location(s), 1 block(s), and 5515 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=5818 blocks=1 instructions=5515 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:54Z USER 9099 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.048 seconds +2025-11-04T21:38:54Z USER 9099 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.050 seconds +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 542mb, ru_maxrss: 553mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2861 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 537mb, ru_maxrss: 553mb (delta=0mb) +2025-11-04T21:38:54Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2860 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Total count: 11337 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Matmult: 8240 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: TensorTensor: 672 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: GenericCopy: 652 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: TensorScalarPtr: 634 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Activation: 464 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Load: 364 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: DMACopy: 131 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Save: 105 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: CollectiveCompute: 36 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Memset: 24 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: CoreBarrier: 5 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 128 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: unroll finished after 0.849 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 708mb, ru_maxrss: 708mb (delta=391mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: sg0001 Instruction count after Unroll: +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Total count: 11270 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Matmult: 8240 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: TensorTensor: 672 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: GenericCopy: 652 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: TensorScalarPtr: 634 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Activation: 464 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Load: 364 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: DMACopy: 129 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Save: 40 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: CollectiveCompute: 36 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Memset: 24 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: StreamShuffle: 8 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: CoreBarrier: 5 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Select: 1 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: BIRKernel: 1 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [Unroll]: Unrolled DGE count with Dynamic AP: 128 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: unroll finished after 0.843 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 652mb, ru_maxrss: 708mb (delta=383mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6506 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=6506 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6506 memory location(s), 1 block(s), and 11270 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=6506 blocks=1 instructions=11270 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.026 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 662mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3237 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Total count: 19387 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Matmult: 15434 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: GenericCopy: 1644 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Load: 769 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Save: 365 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: TensorTensor: 311 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Activation: 235 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: TensorScalarPtr: 152 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Memset: 30 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: TensorReduce: 18 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: CollectiveCompute: 9 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Select: 6 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Iota: 5 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: unroll finished after 0.905 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 602mb, ru_maxrss: 708mb (delta=391mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7996 memory location(s), 1 block(s), and 19387 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=7996 blocks=1 instructions=19387 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o1 finished after 0.064 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 605mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3140 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:54 2025 + +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.049 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4189 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: sg0002 Instruction count after Unroll: +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Total count: 19377 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Matmult: 15434 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: GenericCopy: 1644 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Load: 769 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Save: 355 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: TensorTensor: 311 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Activation: 235 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: TensorScalarPtr: 152 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Gather: 131 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Max: 128 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Memset: 30 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: TensorReduce: 18 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: CoreBarrier: 13 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: CollectiveCompute: 9 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Select: 6 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Iota: 5 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: StreamShuffle: 4 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Reciprocal: 3 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: DMACopy: 2 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [Unroll]: Unrolled DGE count with Dynamic AP: 1 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: unroll finished after 0.959 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=391mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 7996 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=7996 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o1 finished after 0.035 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.998 seconds +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=391mb) +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=20028 blocks=6 instructions=71037 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z USER 9099 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z USER 9099 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=5721 blocks=2 instructions=11032 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=6377 blocks=2 instructions=22030 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=7930 blocks=2 instructions=37975 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:55Z USER 9099 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6369 memory location(s), 2 block(s), and 22030 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7926 memory location(s), 2 block(s), and 37975 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5713 memory location(s), 2 block(s), and 11032 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: curr_vmrss: 610mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=20008 blocks=6 instructions=71037 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i32}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i33}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i34}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i35}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i36}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i37}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i38}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i39}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i40}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i41}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i42}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i43}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i44}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i45}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i46}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i47}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i48}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i49}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i50}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i51}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i52}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i53}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i54}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i55}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i56}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i57}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i58}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i59}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i60}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i61}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i62}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i63}@SB<0,0>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1272_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:55Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1277_i1}@SB<0,0>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.025 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 611mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.030 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 611mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.042 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 611mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.048 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 611mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.077 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 614mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.082 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.084 seconds +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=20008 blocks=6 instructions=71037 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:55Z USER 9099 (sg01) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:55Z USER 9099 (sg02) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=5713 blocks=2 instructions=11032 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=6369 blocks=2 instructions=22030 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=7926 blocks=2 instructions=37975 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (sg01) [SubgraphForkPass]: lnc_verifier finished after 0.004 seconds +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 6369 memory location(s), 2 block(s), and 22030 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.005 seconds +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 5713 memory location(s), 2 block(s), and 11032 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (sg02) [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7926 memory location(s), 2 block(s), and 37975 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:55Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=20008 blocks=6 instructions=71037 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: instruction_reorder finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.007 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z WARNING 9099 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 16 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: psum_legalization finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.009 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: instruction_reorder finished after 0.011 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: instruction_reorder finished after 0.013 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 384 bytes/partition +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z WARNING 9099 (nc01/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.002 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.007 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z WARNING 9099 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 616mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 16 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: psum_legalization finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: psum_legalization finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.004 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.005 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: vn_splitter finished after 0.012 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 1 nodes. Total savings 384 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.006 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.005 seconds +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.013 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.014 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.028 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: legalize_cce_dma finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z WARNING 9099 (nc00/sg01) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: constant_propagate finished after 0.036 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 617mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [build_flow_deps]: Allocs: 2856 instructions: 5514 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.053 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z WARNING 9099 (nc01/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.017 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.008 seconds +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: vn_splitter finished after 0.034 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 16 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.060 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 2 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 1 clusters. +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: No split opportunities: +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: remat_optimization finished after 0.035 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.085 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [build_flow_deps]: Allocs: 2857 instructions: 5518 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running pre_opts +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to pre_opts: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreOpts]: Skipped. No pre-opt passes enabled +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: pre_opts finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z WARNING 9099 (nc00/sg02) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 36 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 3 clusters. +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 23 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0.004 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.007 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.018 seconds +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: vn_splitter finished after 0.040 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 618mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 16852 edges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [build_flow_deps]: Done build fdeps 16852 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.102 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2856 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2856 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2857 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 619mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: size = 763 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: constant_propagate finished after 0.081 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: found 2077 edges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: mean: 5.4443 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: median: 6.99945 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 16616 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_ac finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: lo = 763 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: total = 763 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 16855 edges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [build_flow_deps]: Done build fdeps 16855 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: input_dma_coalescing finished after 0.008 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.024 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [build_flow_deps]: Allocs: 3136 instructions: 10693 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.088 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.008 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: remat_optimization finished after 0.014 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.002 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: infer_stream_ids finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 36 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 3 clusters. +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:55Z INFO 9099 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.05 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.022 seconds +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: vn_splitter finished after 0.091 seconds +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 620mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 131 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.036 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5514 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2858 blocks=1 instructions=5514 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 67252996 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3667 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 42991616 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3168 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.040 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2857 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=2857 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2858 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=2858 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 621mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: size = 763 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 2039 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: found 2077 edges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: mean: 5.4443 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: median: 6.99945 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 16616 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 35098 edges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [build_flow_deps]: Done build fdeps 35098 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: found 295 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: largest = custom-call.177.2034_i7 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: lo = 763 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: total = 763 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.016 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: constant_propagate finished after 0.132 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.007 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_ac finished after 0.006 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: pre_sched finished after 0.154 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3136 memory location(s), 1 block(s), and 10693 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3136 blocks=1 instructions=10693 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 4 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 141 remat count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.016 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 622mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 3 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Num intervals 2039 Num locations 2039 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [build_flow_deps]: Allocs: 3233 instructions: 11337 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: edge: 83914 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: mean: 82.309 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: median: 70.3318 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 131 PSUM Banks +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.034 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: safe = 1352 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: unsafe = 572 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: inf = 113 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: total = 2037 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2859 memory location(s), 1 block(s), and 5518 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=2859 blocks=1 instructions=5518 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 372 #Pinned 0 #Safe 0 minCost 0.00363194 maxCost 0.0725735 locations 2039 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: new candidates = 44 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: (including 1 infinite cost tensors) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.024 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 67252996 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3667 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 42991618 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3168 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3132 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3132 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3133 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3133 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 623mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Total: 2037 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Spilled: 0.002 (4) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Allocated: 0.998 (2033) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Rover zone: 0.685 (1393) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.020 (40) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.295 (600) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks tall: 1.000 (2032) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.997 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: SB spills = 4 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 8192 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: SB score = 24144 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: collect spills +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 2040 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: remat_optimization finished after 0.033 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 624mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: size = 764 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: found 295 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: largest = custom-call.177.2034_i2 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: found 1750 edges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: mean: 4.58115 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: median: 4.30809 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: adjacency vectors require 14000 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: insert spills +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 2047 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: lo = 764 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: total = 764 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.026 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 628mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 141 remat count +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.046 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 628mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.005 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 628mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Num intervals 2040 Num locations 2040 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: found 295 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: largest = custom-call.177.2034_i7 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: edge: 83940 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: mean: 82.2941 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: median: 70.0986 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: safe = 1353 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: unsafe = 572 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: inf = 113 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: total = 2038 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 372 #Pinned 0 #Safe 0 minCost 0.00363194 maxCost 0.0725735 locations 2040 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.017 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: new candidates = 44 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: (including 1 infinite cost tensors) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Total: 2038 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Spilled: 0.002 (4) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Allocated: 0.998 (2034) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Rover zone: 0.685 (1394) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.020 (40) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.295 (600) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks tall: 1.000 (2033) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.996 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: SB spills = 4 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 8192 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: SB score = 24144 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: collect spills +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: insert spills +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 2048 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: found 295 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: largest = custom-call.177.2034_i2 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 149 remat count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 36765 edges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [build_flow_deps]: Done build fdeps 36765 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove_useless_insts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Num intervals 2047 Num locations 2047 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 116 PSUM Banks +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: edge: 81622 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: mean: 79.7479 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: median: 67.8531 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 149 remat count +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: safe = 12 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: unsafe = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: total = 12 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 2047 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Total: 12 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Rover zone: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Blocks tall: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Num intervals 2048 Num locations 2048 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: constant_propagate finished after 0.203 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 24144 cycles +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: number of tensors spilled from SB = 4 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: total size of spilled tensors = 8192 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.066 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3134 memory location(s), 1 block(s), and 10689 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3134 blocks=1 instructions=10689 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 161587716 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3496 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 20971520 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 4096 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 69350148 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3581 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 44040192 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3127 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.203 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_ac finished after 0.009 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: edge: 81648 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: mean: 79.7344 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: median: 68.0681 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2870 memory location(s), 1 block(s), and 5526 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2870 blocks=1 instructions=5526 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: pre_sched finished after 0.242 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3233 memory location(s), 1 block(s), and 11337 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3233 blocks=1 instructions=11337 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 2327 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: safe = 12 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: unsafe = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: total = 12 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 2048 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Total: 12 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Rover zone: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Blocks tall: 1.000 (12) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [build_flow_deps]: Allocs: 3739 instructions: 18598 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.017 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 629mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2870 memory location(s), 1 block(s), and 5526 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 4 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2870 blocks=1 instructions=5526 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: found 676 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 113390340, 26.0201% input load, 8.32274% output write, 65.6572% spill/reload [sg0000] +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: largest = custom-call.182.1810_i13 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: input_dma_coalescing finished after 0.023 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 630mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 24144 cycles +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: number of tensors spilled from SB = 4 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: total size of spilled tensors = 8192 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 69350148 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3581 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 44040194 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 3127 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 172 bytes +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.182 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 630mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2871 memory location(s), 1 block(s), and 5530 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2871 blocks=1 instructions=5530 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.012 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 307 remat count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 630mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2871 memory location(s), 1 block(s), and 5530 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=2871 blocks=1 instructions=5530 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 113390342, 26.0201% input load, 8.32274% output write, 65.6572% spill/reload [sg0000] +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Num intervals 2327 Num locations 2327 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.054 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3229 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3229 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3230 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3230 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 5242880, 4.62374% out of total dma traffic(2.95043e+07) +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 631mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: edge: 195116 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: mean: 167.697 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: median: 123.613 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 25 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 25 spill/reload memory locations +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: remat_optimization finished after 0.060 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 632mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 5242880, 4.62374% out of total dma traffic(2.95043e+07) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: safe = 374 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: unsafe = 1373 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: inf = 578 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: total = 2325 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 1356 #Pinned 0 #Safe 0 minCost 0.0011279 maxCost 0.0700129 locations 2327 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: new candidates = 204 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: (including 1 infinite cost tensors) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 25 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 25 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 12058624, 16.1972% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 632mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 12058624, 16.1972% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: size = 828 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: infer_stream_ids finished after 0.010 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 633mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19377 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=4187 blocks=1 instructions=19377 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Found 3 Splits CCs +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: Grouped CCs to 2 clusters. +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: found 1974 edges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: mean: 4.76812 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: median: 4.93397 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: adjacency vectors require 15792 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: find costs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 50394 edges +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [build_flow_deps]: Done build fdeps 50394 Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Total: 2325 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Spilled: 0.037 (86) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Allocated: 0.963 (2239) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Rover zone: 0.297 (664) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.021 (48) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.682 (1527) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks tall: 1.000 (2238) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.999 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: SB spills = 86 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 138240 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: SB score = 666729 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: collect spills +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:55Z INFO 9099 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: Start split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 8 SpillSaves and Reloads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 8 SpillSaves and Reloads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: insert spills +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 53621508 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 42467330 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: Num_Splits: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: End split live ranges Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17301504, 15.2584% out of total dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 96088838, 25.2489% input load, 9.82131% output write, 64.9298% spill/reload [sg0000] +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 53621508 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 42467330 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 172 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1344 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.108 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 634mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5491 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2829 blocks=1 instructions=5491 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 53621508 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 42467328 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: lo = 828 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: hi = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: total = 828 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: simplify +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: select ranges +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 2612 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 22 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17301504, 15.2584% out of total dma traffic +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 96088836, 25.2489% input load, 9.82131% output write, 64.9298% spill/reload [sg0000] +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 53621508 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3571 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 42467328 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3159 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 8486912 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 172 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 1344 bytes +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.157 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 635mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5487 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=2828 blocks=1 instructions=5487 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_loads +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 53 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: no more spills +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: coloring_allocator_psum finished after 0.116 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 635mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 53 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: found 676 accumulation groups +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: largest = custom-call.182.1810_i13 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: End remove redundncies Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: Start DCE Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z INFO 9099 []: find first defs for local +2025-11-04T21:38:55Z INFO 9099 []: find first defs for global +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:55 2025 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: pre_sched finished after 0.306 seconds +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 636mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:55Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: dma_optimization_psum finished after 0.045 seconds +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 636mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:55Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:55Z INFO 9099 (nc00/sg02) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 56 Sb address +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 592 remat count +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: Num intervals 2612 Num locations 2612 +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:55Z INFO 9099 (nc01/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:55Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: edge: 151394 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: mean: 115.922 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: median: 84.6788 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 56 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 8 PSUM Banks +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 177 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: safe = 342 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: unsafe = 3 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: inf = 26 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: total = 371 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 3 #Pinned 0 #Safe 0 minCost 0.0129703 maxCost 0.0158031 locations 2612 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: new candidates = 3 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: (including 26 infinite cost tensors) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Total: 371 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Allocated: 1.000 (371) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Rover zone: 0.933 (346) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Pre-rover zone: 0.030 (11) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Post-rover zone: 0.038 (14) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Blocks tall: 1.000 (371) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: Success +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.118 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 638mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5491 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2829 blocks=1 instructions=5491 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: spilling from SB cost about 666729 cycles +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: number of tensors spilled from SB = 86 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: total size of spilled tensors = 138240 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 178 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: reserved space = 262400 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: spill space = 7864320 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 7864320 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: size = 15 +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Num intervals 15 Num locations 15 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 211788292 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2561 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 38666240 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2397 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.312 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3505 memory location(s), 1 block(s), and 11060 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3505 blocks=1 instructions=11060 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: End DCE Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: lo = 15 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: total = 15 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.138 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 7864320 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5487 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.024 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5491 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2829 blocks=1 instructions=5491 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=2828 blocks=1 instructions=5487 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 7864320 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 7864320 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.005 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5491 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2829 blocks=1 instructions=5491 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 72 out of 550 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5491 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2829 blocks=1 instructions=5491 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 116 PSUM Banks +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 583mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2829 memory location(s), 1 block(s), and 5492 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2829 blocks=1 instructions=5492 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 5492, number of allocs: 2829 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2751-0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.000439 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2751-0] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: input0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: input1: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: input2: [ 4 4096 128 ] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: output0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 0 +Memory Location: {reshape.16}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 0 +Memory Location: {reshape.24}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 4096 / 4096 = 1 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Scratch sbuf for kernel I-2751-0: [16384, 117692) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: reserved space = 262400 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: spill space = 7864320 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 7864320 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: size = 15 +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: Num intervals 15 Num locations 15 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [build_flow_deps]: Allocs: 4187 instructions: 19373 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: lo = 15 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: total = 15 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 7864320 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.032 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 593mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5487 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=2828 blocks=1 instructions=5487 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.132 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 594mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3739 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 7864320 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=3739 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 597mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3740 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=3740 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 597mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.046 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 599mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3505 memory location(s), 1 block(s), and 11060 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=3505 blocks=1 instructions=11060 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 250454532, 49.4457% input load, 0% output write, 50.5543% spill/reload [sg0001] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 18 PSUM Banks +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: address_rotation_psum finished after 0.130 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 7864320 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.011 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 599mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5487 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=2828 blocks=1 instructions=5487 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.004 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 600mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 602mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3231 memory location(s), 1 block(s), and 11333 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3231 blocks=1 instructions=11333 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 602mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 72 out of 549 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.004 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 602mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5487 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=2828 blocks=1 instructions=5487 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 161587716 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3496 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 54525954 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 4095 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 606mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 2828 memory location(s), 1 block(s), and 5488 instruction(s). Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=2828 blocks=1 instructions=5488 Max writers: 64 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 5488, number of allocs: 2828 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2751-0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.000357 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Input/output shapes for Kernel inst [I-2751-0] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: input0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: input1: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: input2: [ 4 4096 128 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: input3: ap +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: output0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 2097152 +Memory Location: {reshape.16}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 2097152 +Memory Location: {reshape.24}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 4096 / 4096 = 1 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Scratch sbuf for kernel I-2751-0: [16384, 117692) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 0.211672 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: allocating SB +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 2360 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.064 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 624mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5636 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5636 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 625mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5636 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5636 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 625mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5636 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5636 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: found 740 accumulation groups +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: largest = custom-call.182.1810_i5 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: size = 1338 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1048576, 0.418669% out of total dma traffic(1.23839e+08) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: found 2203 edges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: mean: 3.29297 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: median: 2.95449 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: adjacency vectors require 17624 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 64 memorylocations +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.068 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 656mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5668 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=5668 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 62305 edges +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [build_flow_deps]: Done build fdeps 62305 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: End build flow dependencies Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: Start remove useless insts Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: remove_useless_insts +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.007 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 661mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5668 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5668 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 0.437477 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.113 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 661mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5635 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5635 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 661mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5635 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5635 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 661mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5635 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5635 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: End remove useless insts Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 128 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 128 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 307 remat count +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Num intervals 2360 Num locations 2360 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 64 memorylocations +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.035 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 667mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5667 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 10 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 10 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=5667 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 667mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5667 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5667 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.053 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 668mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5668 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=5668 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: lo = 1264 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: total = 1338 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 668mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {I-2751-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5668 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=5668 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 7Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: edge: 202062 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: mean: 171.239 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: median: 130.62 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [build_flow_deps]: Allocs: 5668 instructions: 11383 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 24510464, 19.3582% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: safe = 374 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: unsafe = 1398 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: inf = 586 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: total = 2358 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 1381 #Pinned 0 #Safe 0 minCost 0.00111795 maxCost 0.0700129 locations 2360 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: new candidates = 200 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: (including 1 infinite cost tensors) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.217 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 672mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: pre_sched finished after 0.481 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 672mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4187 memory location(s), 1 block(s), and 19373 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=4187 blocks=1 instructions=19373 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Total: 2358 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Spilled: 0.038 (89) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Allocated: 0.962 (2269) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Rover zone: 0.310 (703) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.004 (8) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.687 (1558) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.000 (1) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks tall: 1.000 (2268) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 0.998 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: SB spills = 89 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 126976 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: SB score = 691515 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: collect spills +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.077 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 672mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5667 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=5667 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 673mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5667 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=5667 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 8Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [build_flow_deps]: Allocs: 5667 instructions: 11379 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: insert spills +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 63 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 2666 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 29747 edges +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [build_flow_deps]: Done build fdeps 29747 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.122 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 675mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5668 memory location(s), 1 block(s), and 11383 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=5668 blocks=1 instructions=11383 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.092 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 673mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find partners +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: found 740 accumulation groups +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: largest = custom-call.182.1810_i5 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: tensors = 17 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: requires 33280 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: expanding partners +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.017 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 673mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5656 memory location(s), 1 block(s), and 11355 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5656 blocks=1 instructions=11355 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 156 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 3254 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 3472 bytes +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 29744 edges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [build_flow_deps]: Done build fdeps 29744 Tue Nov 4 21:38:56 2025 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.112 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 686mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5667 memory location(s), 1 block(s), and 11379 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=5667 blocks=1 instructions=11379 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 40 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 3407 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 3923 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.021 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 687mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5655 memory location(s), 1 block(s), and 11351 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5655 blocks=1 instructions=11351 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average loaded DMA size 3407 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find loads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: average saved DMA size 3923 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 186229252 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3407 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 38666240 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3923 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.183 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 687mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4124 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 2 pin count +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 613 remat count +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: build interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=4124 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 688mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4125 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=4125 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 688mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.010 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 25559040, 10.2051% out of total dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 224895492, 54.5989% input load, 0% output write, 45.4011% spill/reload [sg0001] +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.006 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 691mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 186229252 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3407 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Num intervals 2666 Num locations 2666 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 38666240 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3923 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4259840 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2355 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.445 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 693mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10792 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3187 blocks=1 instructions=10792 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.131 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 696mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: edge: 155283 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: mean: 116.491 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: median: 84.6333 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5656 memory location(s), 1 block(s), and 11355 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5656 blocks=1 instructions=11355 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: safe = 372 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: unsafe = 2 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: inf = 21 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: total = 395 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 2 #Pinned 0 #Safe 0 minCost 0.0133112 maxCost 0.0158031 locations 2666 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: new candidates = 2 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: (including 21 infinite cost tensors) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Total: 395 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Allocated: 1.000 (395) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Rover zone: 0.924 (365) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Pre-rover zone: 0.020 (8) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Post-rover zone: 0.056 (22) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Blocks tall: 1.000 (395) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: Success +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 128 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 61 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: spilling from SB cost about 691515 cycles +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: number of tensors spilled from SB = 89 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: total size of spilled tensors = 126976 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 210870788 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 2469 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 70778882 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 2864 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4259840 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 130 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: size = 1462 +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: coloring_allocator_sb finished after 0.501 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 697mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3626 memory location(s), 1 block(s), and 11728 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3626 blocks=1 instructions=11728 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.127 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 690mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5655 memory location(s), 1 block(s), and 11351 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5655 blocks=1 instructions=11351 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.068 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11227 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5511 blocks=1 instructions=11227 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: found 2265 edges +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: mean: 3.0985 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: median: 2.57601 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: adjacency vectors require 18120 bytes +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 128 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.242 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z USER 9099 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.027 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3741 memory location(s), 1 block(s), and 18598 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:56Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11227 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=3741 blocks=1 instructions=18598 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 306646160 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3273 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 25318912 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3929 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.048 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3626 memory location(s), 1 block(s), and 11728 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=3626 blocks=1 instructions=11728 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 281649670, 43.9692% input load, 5.95677% output write, 50.0741% spill/reload [sg0001] +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.055 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 684mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11223 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5510 blocks=1 instructions=11223 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: lo = 1388 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: total = 1462 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 2350 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: sub-graph will get execute 27 times +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 1048576, 0.372298% out of total dma traffic(1.23839e+08) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: no more spills +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: coloring_allocator_psum finished after 0.168 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:56Z USER 9099 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.035 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 684mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: found 1327 accumulation groups +2025-11-04T21:38:56Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11223 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1198_i107 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 150 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 150 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 10 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 1]: removed 10 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: dma_optimization_psum finished after 0.035 seconds +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 685mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 2]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 25559040, 18.1227% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 62 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 165 Sb address +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: 554 remat count +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 62 PSUM Banks +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Num intervals 2350 Num locations 2350 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.304 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 690mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10792 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3187 blocks=1 instructions=10792 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: edge: 73076 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: mean: 62.1923 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: median: 49.69 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: reserved space = 196608 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: spill space = 17694720 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 17694720 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: size = 37 +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: safe = 1652 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: unsafe = 530 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: inf = 166 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: total = 2348 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 513 #Pinned 0 #Safe 0 minCost 0.00173444 maxCost 0.464629 locations 2350 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: new candidates = 131 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Total: 2348 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Spilled: 0.003 (6) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Allocated: 0.997 (2342) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Rover zone: 0.805 (1885) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.016 (37) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.179 (420) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.015 (36) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.003 (7) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.745 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.810 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.841 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks tall: 0.982 (2299) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.855 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: Num intervals 37 Num locations 37 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: SB spills = 6 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 49152 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: SB score = 74682 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: collect spills +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: lo = 37 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: total = 37 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.057 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10792 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3187 blocks=1 instructions=10792 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 14680064 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: insert spills +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 14680064 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.023 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10792 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3187 blocks=1 instructions=10792 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [TensorCopyAccel::Impl]: Accelerated 40 out of 607 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.001 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10792 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3187 blocks=1 instructions=10792 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:56Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 5 PSUM Banks +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: peephole_opts finished after 0.005 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 689mb, ru_maxrss: 708mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3187 memory location(s), 1 block(s), and 10793 instruction(s). Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3187 blocks=1 instructions=10793 Max writers: 64 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 10793, number of allocs: 3187 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2512-0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Scan BKs time (s): 0.00258 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2512-0] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: input0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: input1: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: input2: [ 4 4096 128 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: output0: [ 4 128 4096 ] +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 2097152 +Memory Location: {reshape.60}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 2097152 +Memory Location: {reshape.68}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 4096 / 4096 = 1 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Scratch sbuf for kernel I-2512-0: [16384, 117692) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 2362 +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 168 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 3234 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 3661 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: found 1327 accumulation groups +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: largest = _dot.199-t1198_i107 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:56Z INFO 9099 (nc01/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [LowerKernel]: Lower BKs time (s): 0.110915 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_kernel finished after 0.035 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 715mb, ru_maxrss: 715mb (delta=7mb) +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 48 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 3379 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5994 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 3977 bytes +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5994 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 []: find first defs for local +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.003 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 716mb, ru_maxrss: 716mb (delta=1mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5994 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5994 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 716mb, ru_maxrss: 716mb (delta=0mb) +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5994 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5994 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:56Z INFO 9099 (nc01/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-11-04T21:38:56Z INFO 9099 []: find first defs for global +2025-11-04T21:38:56Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average loaded DMA size 3379 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: average saved DMA size 3977 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 184263172 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3379 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 70778882 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 3977 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: address_rotation_psum finished after 0.254 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 719mb, ru_maxrss: 719mb (delta=11mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4126 memory location(s), 1 block(s), and 19310 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=4126 blocks=1 instructions=19310 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 26607616, 9.44706% out of total dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 255042054, 48.1452% input load, 6.57822% output write, 45.2766% spill/reload [sg0001] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 184263172 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3379 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 70778882 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 3977 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4259840 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 130 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 2467 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 307281564 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3259 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 25334281 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 3704 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: dma_optimization_sb finished after 0.336 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 718mb, ru_maxrss: 719mb (delta=11mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11433 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3276 blocks=1 instructions=11433 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 52 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: 566 remat count +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 65 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: allocating SB +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Num intervals 2362 Num locations 2362 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 2601 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: edge: 68861 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: mean: 58.3074 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: median: 47.2596 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: found 1451 accumulation groups +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [NonSSALeg]: [Non-SSA legalization]created 64 memorylocations +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.074 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 725mb, ru_maxrss: 725mb (delta=9mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1198_i21 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: safe = 5 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: unsafe = 3 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: inf = 10 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: total = 18 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 3 #Pinned 0 #Safe 0 minCost 0.0120452 maxCost 0.0360466 locations 2362 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: new candidates = 3 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6026 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: (including 10 infinite cost tensors) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=6026 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Total: 18 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Allocated: 1.000 (18) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Rover zone: 0.722 (13) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Post-rover zone: 0.278 (5) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Blocks tall: 1.000 (18) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: Success +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 726mb, ru_maxrss: 726mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6026 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6026 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i32}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i33}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i34}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i35}@SB<0,50688>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i36}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i37}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i38}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i39}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i40}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i41}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i42}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i43}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i44}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i45}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i46}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i47}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i48}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i49}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i50}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i51}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i52}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i53}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i54}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i55}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i56}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i57}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i58}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i59}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i60}@SB<0,16384>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i61}@SB<0,20480>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i62}@SB<0,16384>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {_dot.6-t1583_i63}@SB<0,24576>(128x4096)#Internal DebugInfo: <_dot.6||UNDEF||[128, 2048, 1]> +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: spilling from SB cost about 74682 cycles +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: number of tensors spilled from SB = 6 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: total size of spilled tensors = 49152 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z INFO 9099 []: find first defs for local +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 319229072 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3352 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 31610368 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 4383 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.421 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 725mb, ru_maxrss: 726mb (delta=18mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3759 memory location(s), 1 block(s), and 18616 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3759 blocks=1 instructions=18616 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 []: find first defs for global +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: birverifier finished after 0.051 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 728mb, ru_maxrss: 728mb (delta=2mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6026 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=6026 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.005 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 728mb, ru_maxrss: 728mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6026 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=6026 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.042 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 728mb, ru_maxrss: 728mb (delta=2mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3759 memory location(s), 1 block(s), and 18616 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 9Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=3759 blocks=1 instructions=18616 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 133 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 564 remat count +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 350839440, 80.2296% input load, 0% output write, 19.7704% spill/reload [sg0002] +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [build_flow_deps]: Allocs: 6026 instructions: 16684 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Num intervals 2601 Num locations 2601 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: edge: 74549 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: mean: 57.3233 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: median: 45.5269 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: safe = 1901 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: unsafe = 532 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: inf = 166 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: total = 2599 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 513 #Pinned 0 #Safe 0 minCost 0.00173444 maxCost 0.464629 locations 2601 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: new candidates = 131 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Total: 2599 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Spilled: 0.002 (6) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Allocated: 0.998 (2593) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Rover zone: 0.803 (2083) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.030 (78) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.165 (428) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Slice zone: 0.002 (4) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.049 (126) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.007 (17) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (mean): 0.629 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (median): 0.693 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until medium blocking (p95): 0.862 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks tall: 0.945 (2450) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 0.794 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 0.988 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: SB spills = 6 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 49152 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: SB score = 74682 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: best SB heuristic = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: collect spills +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: insert spills +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: deleting loads #loadsToDelete: 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: deleting locs #locationsToDelete: 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: locationsToDelete done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: main loop +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 2613 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 11542528, 3.28997% out of total dma traffic(2.81477e+08) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find partners +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 150 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 48221 edges +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [build_flow_deps]: Done build fdeps 48221 Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: build_fdeps finished after 0.106 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 738mb, ru_maxrss: 738mb (delta=10mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: found 1451 accumulation groups +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: largest = _dot.199-t1198_i21 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: tensors = 36 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: requires 49152 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: expanding partners +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6026 memory location(s), 1 block(s), and 16684 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=6026 blocks=1 instructions=16684 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 []: find first defs for local +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: remove_redundancies finished after 0.021 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 732mb, ru_maxrss: 738mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6014 memory location(s), 1 block(s), and 16656 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=6014 blocks=1 instructions=16656 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.251 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 732mb, ru_maxrss: 738mb (delta=19mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11433 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3276 blocks=1 instructions=11433 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9099 []: find first defs for global +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 6291456, 9.07042% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: reserved space = 196608 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: spill space = 16252928 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 16252928 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: size = 35 +2025-11-04T21:38:57Z INFO 9099 []: find first defs for local +2025-11-04T21:38:57Z INFO 9099 []: find first defs for global +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Num intervals 35 Num locations 35 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: lo = 35 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: total = 35 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 14155776 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram finished after 0.030 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 745mb, ru_maxrss: 745mb (delta=7mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11433 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3276 blocks=1 instructions=11433 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find loads +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 2 pin count +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 576 remat count +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: build interference graph +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm before rotation 14155776 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Num intervals 2613 Num locations 2613 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: edge: 70334 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: mean: 53.8339 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: median: 43.2751 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: find costs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DRAM hwm after rotation 14155776 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: address_rotation_dram finished after 0.038 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 753mb, ru_maxrss: 753mb (delta=8mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11433 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3276 blocks=1 instructions=11433 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [TensorCopyAccel::Impl]: Accelerated 104 out of 672 tensorcopy in Function: sg0001 average acceleration factor: 1 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 754mb, ru_maxrss: 754mb (delta=1mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11433 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3276 blocks=1 instructions=11433 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: safe = 5 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: unsafe = 3 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: inf = 10 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: total = 18 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 3 #Pinned 0 #Safe 0 minCost 0.0120452 maxCost 0.0360466 locations 2613 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: new candidates = 3 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: (including 10 infinite cost tensors) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Total: 18 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Allocated: 1.000 (18) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Rover zone: 0.722 (13) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Pre-rover zone: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Post-rover zone: 0.278 (5) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Blocks tall: 1.000 (18) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: Success +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: peephole_opts finished after 0.009 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 756mb, ru_maxrss: 756mb (delta=2mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3276 memory location(s), 1 block(s), and 11434 instruction(s). Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3276 blocks=1 instructions=11434 Max writers: 64 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Start of kernel lowering pass, number of insts: 11434, number of allocs: 3276 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Found InstBIRKernel: [CausalAttentionMMSoftmaxMMWithoutSwap]I-2512-0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Scan BKs time (s): 0.000749 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Set architecture: gen3 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Input/output shapes for Kernel inst [I-2512-0] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: input0: [ 4 128 4096 ] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: input1: [ 4 128 4096 ] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: input2: [ 4 4096 128 ] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: input3: ap +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: output0: [ 4 128 4096 ] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: do_input1_tp=false +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: do_out_tp=true +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 0 +Memory Location: {reshape.60}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Legalized inp_ap=[[524288,4],[4096,128],[1,4096]] +Offset: 0 +Memory Location: {reshape.68}@DRAM(4194304x2)#Internal DebugInfo: +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: AP of Q indicates standalone Q tensor. +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: parallel_split_n = input1_ap[1].getStep() / input1_ap[2].getNum() = 4096 / 4096 = 1 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Sharding/tiling split_i=0, split_n=1 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Flash attention has been disabled +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Scratch sbuf for kernel I-2512-0: [16384, 117692) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.105 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 763mb, ru_maxrss: 763mb (delta=25mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6014 memory location(s), 1 block(s), and 16656 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=6014 blocks=1 instructions=16656 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: seq_len=4096, seq_len2=4096, complete_seq_len2=4096 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Creating identity matrices with AffineSelect +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [TensorCopyElim]: Tensor CP elimination: 128 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [LowerKernel]: Lower BKs time (s): 0.155995 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_kernel finished after 0.041 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 782mb (delta=26mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6083 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=6083 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_klir_kernel finished after 0.003 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 782mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6083 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=6083 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 782mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6083 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=6083 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.044 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 783mb, ru_maxrss: 783mb (delta=20mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5869 memory location(s), 1 block(s), and 16528 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5869 blocks=1 instructions=16528 Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z USER 9099 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.020 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 783mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5869 memory location(s), 1 block(s), and 16528 instruction(s). Max writers: 129 Max Readers: 1280 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: average loaded DMA size 3289 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: average saved DMA size 4383 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 301395088 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3289 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 31610368 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 4383 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [NonSSALeg]: [Non-SSA legalization]created 64 memorylocations +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: non_ssa_legalization finished after 0.058 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 783mb (delta=1mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17833984, 5.08323% out of total dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 333005456, 81.0601% input load, 0% output write, 18.9399% spill/reload [sg0002] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6115 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=6115 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 301395088 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3289 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 31610368 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 4383 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3368 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.331 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 783mb (delta=55mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18586 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3728 blocks=1 instructions=18586 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 782mb, ru_maxrss: 783mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6115 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6115 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b0}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b1}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b2}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc00/sg01) Non - output memory location with no reader: {I-2512-0_s0_aten__mul_broadcast.7-t210_b3}@SB<0,34052>(128x4)#Internal DebugInfo: +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 184 Sb address +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: birverifier finished after 0.054 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 787mb, ru_maxrss: 787mb (delta=4mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6115 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=6115 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: dynamic_dma_scan finished after 0.004 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 787mb, ru_maxrss: 787mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6115 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=6115 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 10Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: spilling from SB cost about 74682 cycles +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: number of tensors spilled from SB = 6 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: total size of spilled tensors = 49152 bytes/partition +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [build_flow_deps]: Allocs: 6115 instructions: 17325 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 319864476 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3338 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 31625737 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 4157 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 4100 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 241 bytes +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: coloring_allocator_sb finished after 0.533 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 788mb, ru_maxrss: 788mb (delta=69mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4144 memory location(s), 1 block(s), and 19328 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4144 blocks=1 instructions=19328 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.029 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 786mb, ru_maxrss: 788mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4144 memory location(s), 1 block(s), and 19328 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=4144 blocks=1 instructions=19328 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 351490213, 80.171% input load, 1.13801e-06% output write, 19.829% spill/reload [sg0002] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 49932 edges +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [build_flow_deps]: Done build fdeps 49932 Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: build_fdeps finished after 0.082 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 790mb, ru_maxrss: 790mb (delta=3mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6115 memory location(s), 1 block(s), and 17325 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=6115 blocks=1 instructions=17325 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 11538432, 3.28272% out of total dma traffic(2.81793e+08) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [RemoveRedundancies]: remove Useless Instructions: 28 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: remove_redundancies finished after 0.021 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 790mb, ru_maxrss: 790mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6103 memory location(s), 1 block(s), and 17297 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=6103 blocks=1 instructions=17297 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 6291456, 9.02688% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 36 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.104 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 810mb, ru_maxrss: 810mb (delta=20mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6103 memory location(s), 1 block(s), and 17297 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=6103 blocks=1 instructions=17297 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [TensorCopyElim]: Tensor CP elimination: 128 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 47 Sb address +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: tensor_copy_elim finished after 0.039 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5958 memory location(s), 1 block(s), and 17169 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5958 blocks=1 instructions=17169 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: average loaded DMA size 3275 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: average saved DMA size 4157 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 302034588 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3275 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 31625737 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 4157 bytes +2025-11-04T21:38:57Z USER 9099 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5958 memory location(s), 1 block(s), and 17169 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.329 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 810mb (delta=27mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18586 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=3728 blocks=1 instructions=18586 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: spill space = 14680064 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 14680064 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: size = 22 +2025-11-04T21:38:57Z INFO 9099 []: find first defs for local +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 17829888, 5.07266% out of total dma traffic +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 333660325, 80.997% input load, 1.19882e-06% output write, 19.003% spill/reload [sg0002] +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 302034588 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3275 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 31625737 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 4157 bytes +2025-11-04T21:38:57Z INFO 9099 []: find first defs for global +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 4100 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 241 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3341 bytes +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: dma_optimization_sb finished after 0.247 seconds +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=22mb) +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19299 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4114 blocks=1 instructions=19299 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: Num intervals 22 Num locations 22 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: lo = 22 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: total = 22 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.035 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 800mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18586 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=3728 blocks=1 instructions=18586 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 14680064 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 14680064 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.017 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 796mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18586 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=3728 blocks=1 instructions=18586 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [TensorCopyAccel::Impl]: Accelerated 625 out of 1463 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18586 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=3728 blocks=1 instructions=18586 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: peephole_opts finished after 0.007 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 18592, number of allocs: 3728 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [LowerKernel]: Scan BKs time (s): 0.002368 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_kernel finished after 0.002 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.010 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 798mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 207 Sb address +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.002 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 797mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {divide.1_1272_i1}@SB<32,16384>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:57Z WARNING 9099 [birverifier::InstVisitor]: (nc01/sg02) Non - output memory location with no reader: {select.5_1277_i1}@SB<96,17792>(1x1024)#Internal DebugInfo: +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: birverifier finished after 0.030 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 799mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.003 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 800mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 11Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [build_flow_deps]: Allocs: 3728 instructions: 18592 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 50287 edges +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [build_flow_deps]: Done build fdeps 50287 Tue Nov 4 21:38:57 2025 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: build_fdeps finished after 0.056 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 807mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: remove_redundancies finished after 0.007 seconds +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 807mb, ru_maxrss: 810mb (delta=0mb) +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:57Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.094 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 835mb, ru_maxrss: 835mb (delta=25mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.037 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 815mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3728 blocks=1 instructions=18592 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 815mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3728 memory location(s), 1 block(s), and 18592 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 137 Sb address +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.336 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 815mb, ru_maxrss: 835mb (delta=25mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19299 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=4114 blocks=1 instructions=19299 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: reserved space = 34824 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: spill space = 14687236 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 14708736 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: size = 29 +2025-11-04T21:38:58Z INFO 9099 []: find first defs for local +2025-11-04T21:38:58Z INFO 9099 []: find first defs for global +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Num intervals 29 Num locations 29 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: lo = 29 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: total = 29 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 14680064 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram finished after 0.033 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 816mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19299 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=4114 blocks=1 instructions=19299 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm before rotation 14680064 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: allreduce hwm 8388608 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: Real CC buffer size 8388608 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DRAM hwm after rotation 14680064 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: DRAM Rotation rotated 5 Dram address +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: address_rotation_dram finished after 0.014 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 812mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19299 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=4114 blocks=1 instructions=19299 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [TensorCopyAccel::Impl]: Accelerated 625 out of 1601 tensorcopy in Function: sg0002 average acceleration factor: 1 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: tensorcopy_accel finished after 0.005 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 812mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19299 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=4114 blocks=1 instructions=19299 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: peephole_opts finished after 0.005 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [LowerKernel]: Start of kernel lowering pass, number of insts: 19305, number of allocs: 4114 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [LowerKernel]: Scan BKs time (s): 0.001049 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_kernel finished after 0.001 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_klir_kernel finished after 0.001 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_nki_kernel finished after 0.001 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: non_ssa_legalization finished after 0.014 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 812mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.003 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: birverifier finished after 0.027 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 812mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: dynamic_dma_scan finished after 0.002 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 811mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 12Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [build_flow_deps]: Allocs: 4114 instructions: 19305 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 62135 edges +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [build_flow_deps]: Done build fdeps 62135 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: build_fdeps finished after 0.045 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 817mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: remove_redundancies finished after 0.008 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 817mb, ru_maxrss: 835mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.091 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 843mb, ru_maxrss: 843mb (delta=8mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: tensor_copy_elim finished after 0.025 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 826mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=4114 blocks=1 instructions=19305 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.010 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 825mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4114 memory location(s), 1 block(s), and 19305 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 3.135 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 825mb, ru_maxrss: 843mb (delta=135mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=30690 blocks=6 instructions=94044 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=7842 blocks=2 instructions=37897 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=11021 blocks=2 instructions=22450 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=11827 blocks=2 instructions=33697 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7842 memory location(s), 2 block(s), and 37897 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=7842 blocks=2 instructions=37897 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.003 seconds +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11021 memory location(s), 2 block(s), and 22450 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11827 memory location(s), 2 block(s), and 33697 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=11021 blocks=2 instructions=22450 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=11827 blocks=2 instructions=33697 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.007 seconds +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11021 memory location(s), 2 block(s), and 22458 instruction(s). Max writers: 129 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=11021 blocks=2 instructions=22458 Max writers: 129 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: lower_local_collectives finished after 0.010 seconds +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7848 memory location(s), 2 block(s), and 37917 instruction(s). Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=7848 blocks=2 instructions=37917 Max writers: 298 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: lower_local_collectives finished after 0.014 seconds +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11891 memory location(s), 2 block(s), and 33833 instruction(s). Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=11891 blocks=2 instructions=33833 Max writers: 129 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.036 seconds +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11021 memory location(s), 2 block(s), and 22462 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.053 seconds +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7848 memory location(s), 2 block(s), and 37921 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.054 seconds +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11891 memory location(s), 2 block(s), and 33837 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.079 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 821mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94220 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: reserved space = 8093952 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: spill space = 92274688 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 92274688 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: size = 12 +2025-11-04T21:38:58Z INFO 9099 []: find first defs for local +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: reserved space = 8093952 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: spill space = 92274688 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 92274688 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: reserved space = 14722060 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: spill space = 67427328 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: aligned spill space = 67469312 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: size = 20 +2025-11-04T21:38:58Z INFO 9099 []: find first defs for local +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.021 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 826mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: reserved space = 14712832 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: spill space = 67427328 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: aligned spill space = 67469312 bytes +2025-11-04T21:38:58Z INFO 9099 []: find first defs for global +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.026 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 826mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 []: find first defs for global +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Num intervals 12 Num locations 12 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: lo = 12 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: total = 12 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 7864320 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 7864320 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 66584576 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 66584576 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 100139008 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.041 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 826mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: reserved space = 16416768 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: spill space = 117440512 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: aligned spill space = 117440512 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: size = 13 +2025-11-04T21:38:58Z INFO 9099 []: find first defs for local +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: reserved space = 17858560 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: spill space = 117440512 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: aligned spill space = 117440512 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.045 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 822mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16598 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Num intervals 20 Num locations 20 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: lo = 20 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: total = 20 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 14680064 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Already used DRAM hwm: 14680064 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: allreduce_dram_hwm 48250880 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: Real CC buffer size 48250880 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: DRAM hwm after allocation: 65343488 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:58Z INFO 9099 []: find first defs for global +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.053 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 826mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Num intervals 13 Num locations 13 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: lo = 13 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: total = 13 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: simplify +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 14155776 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Already used DRAM hwm: 14155776 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: select ranges +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: allreduce_dram_hwm 81264640 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: Real CC buffer size 81264640 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: DRAM hwm after allocation: 114819072 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.077 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 824mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.081 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 819mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94220 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=7848 blocks=2 instructions=37921 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=11021 blocks=2 instructions=22462 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=11891 blocks=2 instructions=33837 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (sg02) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 819mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7848 memory location(s), 2 block(s), and 37921 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.003 seconds +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11021 memory location(s), 2 block(s), and 22462 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (sg01) [SubgraphForkPass]: sync_shared_allocations finished after 0.004 seconds +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11891 memory location(s), 2 block(s), and 33837 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.009 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94220 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.015 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.016 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.020 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.020 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.035 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.032 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 817mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16598 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.038 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 817mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94220 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:58Z USER 9099 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:58Z INFO 9099 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=15142 blocks=3 instructions=46431 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=3 functions=3 allocs=15618 blocks=3 instructions=47789 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.139 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 831mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 15618 memory location(s), 3 block(s), and 47789 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.194 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 827mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 15142 memory location(s), 3 block(s), and 46431 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: nc_parallel_pass finished after 0.200 seconds +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: curr_vmrss: 823mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:58Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94220 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 818mb, ru_maxrss: 843mb (delta=0mb) +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16598 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:58Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:58Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:58 2025 +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:58Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 3265766 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.241 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 902mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 900mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 3203734 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.013 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 900mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.268 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 896mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 895mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 895mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 123156774 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: post_sched finished after 0.400 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 899mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16598 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.002 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 894mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16598 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5901 blocks=1 instructions=16598 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.014 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 894mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 125475399 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: post_sched finished after 0.456 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 894mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 887mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: dead_code_elim_o0 finished after 0.016 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 2905236 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Time-aware simulation time: 3066099 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: post_sched finished after 0.575 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 889mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 879mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 879mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z INFO 9099 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:59 2025 +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: post_sched finished after 0.594 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 879mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: dead_code_elim_o0 finished after 0.015 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:38:59Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.617 seconds +2025-11-04T21:38:59Z INFO 9099 [BackendPassManager]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=59mb) +2025-11-04T21:38:59Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:59Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94184 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:59Z USER 9099 (sg01) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:59Z USER 9099 (sg02) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:59Z INFO 9099 (sg02) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=7848 blocks=2 instructions=37917 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=11021 blocks=2 instructions=22462 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (sg01) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=11891 blocks=2 instructions=33805 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 (sg02) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:38:59Z INFO 9099 (sg02) [SubgraphForkPass]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z USER 9099 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:59Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (sg02) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 7848 memory location(s), 2 block(s), and 37917 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (sg01) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:38:59Z INFO 9099 (sg01) [SubgraphForkPass]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11021 memory location(s), 2 block(s), and 22462 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (sg01) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 11891 memory location(s), 2 block(s), and 33805 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 3, Passed: 3, Failed: 0 +2025-11-04T21:38:59Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:59Z INFO 9099 [BackendPassManager]: curr_vmrss: 876mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:59Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94184 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 698 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 701 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 124 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 124 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 786 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 784 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 853 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 63 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 64 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 939 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 900 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 948 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 67 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 195 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 174 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 203 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 203 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 67 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 171 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 156 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 160 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 849 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: PSUM Rotation rotated 856 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 32 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: PSUM Rotation rotated 232 PSUM Banks +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 73 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 65 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 39 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 541 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 540 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.366 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 883mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 80 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 29 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.371 seconds +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 881mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:59Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 254 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 311 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 97 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 180 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 180 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 104 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 5 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 58 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.142 seconds +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 898mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:38:59Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 123 Sb address +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:59Z INFO 9099 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.152 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.538 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 883mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 520 Sb address +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.051 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 537 Sb address +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 13Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [build_flow_deps]: Allocs: 5511 instructions: 11233 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.053 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 895mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 14Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [build_flow_deps]: Allocs: 5510 instructions: 11229 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: address_rotation_sb finished after 0.589 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 898mb, ru_maxrss: 902mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.642 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 917mb, ru_maxrss: 917mb (delta=15mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 29380 edges +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [build_flow_deps]: Done build fdeps 29380 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 29375 edges +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [build_flow_deps]: Done build fdeps 29375 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: address_rotation_sb finished after 0.734 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 933mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.172 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 929mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.161 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 926mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.207 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 915mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬─────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼─────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 64 │ 19914555392 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 114944 │ +│ Load │ ExternalInput -> Internal │ 55 │ 24146436 │ +│ Load │ Internal │ 352 │ 41943040 │ +│ Save │ Internal │ 199 │ 37224448 │ +│ Save │ Internal -> Output │ 35 │ 9437186 │ +└─────────────────┴────────────────────────────┴───────┴─────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 388 │ +│ 512 │ 1 │ +│ 1024 │ 32 │ +│ 2048 │ 128 │ +│ 4096 │ 153 │ +│ 1048576 │ 128 │ +│ 8388608 │ 4 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬─────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼─────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 64 │ 19914555392 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 114944 │ +│ Load │ ExternalInput -> Internal │ 55 │ 24146436 │ +│ Load │ Internal │ 352 │ 41943040 │ +│ Save │ Internal │ 199 │ 37224448 │ +│ Save │ Internal -> Output │ 34 │ 9437184 │ +└─────────────────┴────────────────────────────┴───────┴─────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 388 │ +│ 512 │ 1 │ +│ 1024 │ 32 │ +│ 2048 │ 128 │ +│ 4096 │ 153 │ +│ 1048576 │ 128 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 5569 #MatMult-Transposes 897 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 5569 #MatMult-Transposes 897 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: IO Tensor size combined: 458002948 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input1 │ ExternalInput │ int32 │ 16384 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 16777216 │ +│ intermediate3 │ Output │ bfloat16 │ 16777216 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.24 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4_i0 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.16 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.29 │ Internal │ bfloat16 │ 8388608 │ +└─────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 914mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11229 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: IO Tensor size combined: 458002948 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output1 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input61 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input67 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input62 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input65 │ ExternalInput │ bfloat16 │ 2097152 │ +│ input1 │ ExternalInput │ int32 │ 16384 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate0 │ Output │ bfloat16 │ 16777216 │ +│ intermediate3 │ Output │ bfloat16 │ 16777216 │ +│ get_tuple_element.1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_gather.1_i0 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.24 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.4_i0 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.16 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.29 │ Internal │ bfloat16 │ 8388608 │ +└─────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.012 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 914mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.179 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 917mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.027 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 903mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [build_flow_deps]: Start build fdeps. Invocation: 15Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [build_flow_deps]: Allocs: 3731 instructions: 18600 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: anti_dependency_analyzer finished after 0.034 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [build_flow_deps]: Start build fdeps. Invocation: 16Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [build_flow_deps]: Allocs: 4117 instructions: 19317 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [build_flow_deps]: Build fdeps inserted 50182 edges +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [build_flow_deps]: Done build fdeps 50182 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.121 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 913mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.219 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 933mb (delta=16mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [build_flow_deps]: Build fdeps inserted 60503 edges +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [build_flow_deps]: Done build fdeps 60503 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: dep_opt finished after 0.093 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 2 │ 16777216 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 668 │ 269901836 │ +│ Load │ Internal │ 61 │ 31460484 │ +│ Save │ Internal │ 354 │ 31610368 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 193 │ +│ 4096 │ 567 │ +│ 8192 │ 12 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: MM Stats: #MatMults 15306 #MatMult-Transposes 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: IO Tensor size combined: 348938256 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input1 │ ExternalInput │ int32 │ 16384 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate84 │ Input │ bfloat16 │ 16777216 │ +│ convert.53 │ Internal │ bfloat16 │ 16777216 │ +│ intermediate83 │ Input │ bfloat16 │ 16777216 │ +│ add.9 │ Internal │ bfloat16 │ 16777216 │ +│ all_reduce.3_i0 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.3_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14_i0 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ add.9_pftranspose_1000-t1600_i9 │ Internal │ bfloat16 │ 1048576 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: report_stats finished after 0.007 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18600 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: dep_opt finished after 0.081 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 4 │ 16777216 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 669 │ 269905932 │ +│ Load │ Internal │ 75 │ 31779720 │ +│ Save │ Internal │ 370 │ 31625733 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 209 │ +│ 2048 │ 1 │ +│ 4096 │ 568 │ +│ 8192 │ 12 │ +│ 9496 │ 2 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: MM Stats: #MatMults 15430 #MatMult-Transposes 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: IO Tensor size combined: 348938256 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input365 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input368 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input366 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input1 │ ExternalInput │ int32 │ 16384 │ +│ input370 │ ExternalInput │ bfloat16 │ 4096 │ +│ input367 │ ExternalInput │ bfloat16 │ 4096 │ +│ input3 │ ExternalInput │ float32 │ 12 │ +│ output0 │ ExternalOutput │ int32 │ 4 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ add.9 │ Internal │ bfloat16 │ 16777216 │ +│ intermediate84 │ Input │ bfloat16 │ 16777216 │ +│ intermediate83 │ Input │ bfloat16 │ 16777216 │ +│ convert.53 │ Internal │ bfloat16 │ 16777216 │ +│ all_reduce.3_i1 │ Internal │ bfloat16 │ 8388608 │ +│ all_reduce.3_i0 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14_i1 │ Internal │ bfloat16 │ 8388608 │ +│ dot.14_i0 │ Internal │ bfloat16 │ 8388608 │ +│ DynamicDMAScratchLoc │ Internal │ uint8 │ 2097152 │ +│ -t3060 │ Internal │ float32 │ 1048576 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: report_stats finished after 0.006 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19317 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.038 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [build_flow_deps]: Start build fdeps. Invocation: 17Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [build_flow_deps]: Allocs: 5990 instructions: 17239 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: anti_dependency_analyzer finished after 0.064 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 891mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dep_opt +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [build_flow_deps]: Start build fdeps. Invocation: 18Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [build_flow_deps]: Allocs: 5901 instructions: 16566 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [build_flow_deps]: Build fdeps inserted 48078 edges +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [build_flow_deps]: Done build fdeps 48078 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [build_flow_deps]: Build fdeps inserted 49894 edges +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [build_flow_deps]: Done build fdeps 49894 Tue Nov 4 21:39:00 2025 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: dep_opt finished after 0.134 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 890mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 32 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 269 │ 118497796 │ +│ Load │ Input -> Internal │ 8 │ 4194304 │ +│ Load │ Internal │ 435 │ 74055680 │ +│ Save │ Internal │ 235 │ 58195968 │ +│ Save │ Internal -> Output │ 33 │ 16777218 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 385 │ +│ 1024 │ 101 │ +│ 2048 │ 56 │ +│ 4096 │ 421 │ +│ 8192 │ 10 │ +│ 1048576 │ 128 │ +│ 8388608 │ 7 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: MM Stats: #MatMults 11248 #MatMult-Transposes 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate5 │ Output │ bfloat16 │ 16777216 │ +│ intermediate3 │ Input │ bfloat16 │ 16777216 │ +│ intermediate6 │ Output │ bfloat16 │ 16777216 │ +│ intermediate0 │ Input │ bfloat16 │ 16777216 │ +│ add.4 │ Internal │ bfloat16 │ 16777216 │ +│ reshape.60 │ Internal │ bfloat16 │ 8388608 │ +│ dot.7_i0 │ Internal │ bfloat16 │ 8388608 │ +│ get_tuple_element.2 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.73 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.68 │ Internal │ bfloat16 │ 8388608 │ +└─────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: report_stats finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 889mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17239 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: dep_opt finished after 0.114 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 889mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running report_stats +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 269 │ 118497796 │ +│ Load │ Input -> Internal │ 8 │ 4194304 │ +│ Load │ Internal │ 436 │ 76021760 │ +│ Save │ Internal │ 205 │ 42860544 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 385 │ +│ 1024 │ 103 │ +│ 2048 │ 54 │ +│ 4096 │ 356 │ +│ 8192 │ 14 │ +│ 1048576 │ 128 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: MM Stats: #MatMults 10736 #MatMult-Transposes 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: IO Tensor size combined: 184558084 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output4 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input6 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input68 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input71 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input69 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input72 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input78 │ ExternalInput │ bfloat16 │ 4194304 │ +│ input76 │ ExternalInput │ bfloat16 │ 2097152 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────┼──────────┼──────────┼──────────────┤ +│ intermediate5 │ Output │ bfloat16 │ 16777216 │ +│ intermediate3 │ Input │ bfloat16 │ 16777216 │ +│ intermediate6 │ Output │ bfloat16 │ 16777216 │ +│ intermediate0 │ Input │ bfloat16 │ 16777216 │ +│ add.4 │ Internal │ bfloat16 │ 16777216 │ +│ reshape.60 │ Internal │ bfloat16 │ 8388608 │ +│ dot.7_i0 │ Internal │ bfloat16 │ 8388608 │ +│ get_tuple_element.2 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.73 │ Internal │ bfloat16 │ 8388608 │ +│ reshape.68 │ Internal │ bfloat16 │ 8388608 │ +└─────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: report_stats finished after 0.007 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16566 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 1.055 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=31mb) +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to assign_trigger_engine: modules=6 functions=6 allocs=30760 blocks=6 instructions=94184 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 233 DMA instructions. Moved 34 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 231 DMA instructions. Moved 32 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AssignTriggerEngine]: Assigned trigger engine for 269 DMA instructions. Moved 34 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AssignTriggerEngine]: Assigned trigger engine for 237 DMA instructions. Moved 32 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AssignTriggerEngine]: Assigned trigger engine for 389 DMA instructions. Moved 19 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AssignTriggerEngine]: Assigned trigger engine for 371 DMA instructions. Moved 17 DMA instructions to CC's engines. +2025-11-04T21:39:00Z INFO 9099 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: assign_trigger_engine finished after 0.048 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Output has 6 module(s), 6 function(s), 30760 memory location(s), 6 block(s), and 94184 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94184 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=3731 blocks=1 instructions=18600 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=4117 blocks=1 instructions=19317 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5510 blocks=1 instructions=11229 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5511 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5990 blocks=1 instructions=17239 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5901 blocks=1 instructions=16566 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.008 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=6 functions=6 allocs=30760 blocks=6 instructions=94208 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: assign_hwdge_engine finished after 0.013 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Output has 6 module(s), 6 function(s), 30760 memory location(s), 6 block(s), and 94208 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94208 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=4117 blocks=1 instructions=19321 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5511 blocks=1 instructions=11237 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5510 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5990 blocks=1 instructions=17243 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5901 blocks=1 instructions=16570 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 192 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 356 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 278 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 199 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.002 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5510 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 2 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 192 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 441 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 404 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 205 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 3 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 224 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 472 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 438 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 203 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 10 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 39 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 10 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 301 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 8 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 44 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 662 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 54 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 4 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 1 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 192 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 356 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 280 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 199 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 22 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 3 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 298 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 1 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 39 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 662 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 54 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: alloc_queues finished after 0.003 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=4117 blocks=1 instructions=19321 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5901 blocks=1 instructions=16570 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5990 blocks=1 instructions=17243 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5511 blocks=1 instructions=11237 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.004 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.004 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.005 seconds +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: chain_dma_transposes finished after 0.005 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: chain_dma_transposes finished after 0.007 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.013 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94208 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:39:00Z USER 9099 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:39:00Z INFO 9099 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=15618 blocks=3 instructions=47801 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=3 functions=3 allocs=15142 blocks=3 instructions=46407 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00) [CoreForkPass]: Output has 3 module(s), 3 function(s), 15618 memory location(s), 3 block(s), and 47801 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01) [CoreForkPass]: Output has 3 module(s), 3 function(s), 15142 memory location(s), 3 block(s), and 46407 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: nc_parallel_pass finished after 0.004 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94208 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=4117 blocks=1 instructions=19321 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5511 blocks=1 instructions=11237 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5510 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5990 blocks=1 instructions=17243 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5901 blocks=1 instructions=16570 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=4117 blocks=1 instructions=19321 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running lower_control +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5511 blocks=1 instructions=11237 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5510 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5990 blocks=1 instructions=17243 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5901 blocks=1 instructions=16570 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.018 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.018 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5511 blocks=1 instructions=11237 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5510 blocks=1 instructions=11233 Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: lower_control finished after 0.026 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: lower_control finished after 0.027 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=3731 blocks=1 instructions=18604 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=4117 blocks=1 instructions=19321 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: lower_control finished after 0.028 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5901 blocks=1 instructions=16570 Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: lower_control finished after 0.030 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 888mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: Running dep_reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5990 blocks=1 instructions=17243 Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Start Dependency Reduction +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 15777 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 15779 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 16962 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 16962 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Processing async instrs... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 16962 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 16962 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 19398 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 19920 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 23309 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 24115 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 20504 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 20504 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Processing redundant descendants, Done. Num edges removed 21387 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Processing async instrs, Done. Num edges removed 21387 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 24733 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 24733 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Processing redundant descendants, Done. Num edges removed 25715 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Processing async instrs, Done. Num edges removed 25715 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Finished dependency reduction: 58965 removed, new total 5542 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc00/sg00) [ModuleForkPass]: dep_reduction finished after 0.143 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 928mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Finished dependency reduction: 59025 removed, new total 5523 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc01/sg00) [ModuleForkPass]: dep_reduction finished after 0.144 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 928mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5511 memory location(s), 1 block(s), and 11237 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5510 memory location(s), 1 block(s), and 11233 instruction(s). Max writers: 130 Max Readers: 896 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Finished dependency reduction: 90466 removed, new total 4870 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc01/sg02) [ModuleForkPass]: dep_reduction finished after 0.203 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 3731 memory location(s), 1 block(s), and 18604 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Finished dependency reduction: 94559 removed, new total 6205 +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc01/sg01) [ModuleForkPass]: dep_reduction finished after 0.217 seconds +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: curr_vmrss: 932mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc01/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5901 memory location(s), 1 block(s), and 16570 instruction(s). Max writers: 130 Max Readers: 1280 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Finished dependency reduction: 103032 removed, new total 5683 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc00/sg02) [ModuleForkPass]: dep_reduction finished after 0.228 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: curr_vmrss: 928mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg02) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 4117 memory location(s), 1 block(s), and 19321 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Num Async removed: 0 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Finished dependency reduction: 101071 removed, new total 6549 +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:39:00Z USER 9099 (nc00/sg01) [ModuleForkPass]: dep_reduction finished after 0.237 seconds +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: curr_vmrss: 927mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z INFO 9099 (nc00/sg01) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5990 memory location(s), 1 block(s), and 17243 instruction(s). Max writers: 130 Max Readers: 1792 +2025-11-04T21:39:00Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 6, Passed: 6, Failed: 0 +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.282 seconds +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: curr_vmrss: 921mb, ru_maxrss: 933mb (delta=0mb) +2025-11-04T21:39:00Z USER 9099 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:39:00Z INFO 9099 [BackendPassManager]: Inputs to nc_parallel_pass: modules=6 functions=6 allocs=30760 blocks=6 instructions=94208 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z USER 9099 (nc00) [CoreForkPass]: Running bir_linker +2025-11-04T21:39:00Z USER 9099 (nc01) [CoreForkPass]: Running bir_linker +2025-11-04T21:39:00Z INFO 9099 (nc01) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=15142 blocks=3 instructions=46407 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc01/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:39:00Z INFO 9099 (nc00) [CoreForkPass]: Inputs to bir_linker: modules=3 functions=3 allocs=15618 blocks=3 instructions=47801 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:00Z INFO 9099 (nc00/sgLnk) [BirLinker]: bir_linker cwd: +2025-11-04T21:39:00Z INFO 9099 (nc01/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:39:00Z INFO 9099 (nc01/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:39:00Z INFO 9099 (nc01/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:39:00Z INFO 9099 (nc00/sgLnk) [BirLinker]: Num intermediates 86 +2025-11-04T21:39:00Z INFO 9099 (nc00/sgLnk) [BirLinker]: Num Module Definitions 3 +2025-11-04T21:39:00Z INFO 9099 (nc00/sgLnk) [BirLinker]: Linking to a call-graph structure +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: Added a new SpillReload Que qSPPIOParam0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/nc01/sgLnk/sg00/tensor_map.json +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: PostLink Stats: #MatMults 310747 #MatMult-Transposes 41275 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: Total Intermediate MMTs 1728 #out: 0 #inp: 1728 #symmetric: 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: tensor_map verification successful. +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: Writing updated tensor_map /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/nc00/sgLnk/sg00/tensor_map.json +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: PostLink Stats: #MatMults 324695 #MatMult-Transposes 55099 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: Total Intermediate MMTs 1728 #out: 0 #inp: 1728 #symmetric: 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: Total Intermediate IOs with MMTs: 2 #out: 0 #inp: 2 #both: 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: releasing pre-link modules +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: bir_linker finished after 0.653 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1372mb, ru_maxrss: 1372mb (delta=439mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 687544216, 60.6487% input load, 1.37259% output write, 37.9787% spill/reload +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: postlnk_dma_report finished after 0.009 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running report_stats +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬─────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼─────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 64 │ 19914555392 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 114944 │ +│ Load │ ExternalInput -> Internal │ 55 │ 24146436 │ +│ Load │ Internal │ 352 │ 41943040 │ +│ Save │ Internal │ 199 │ 37224448 │ +│ Save │ Internal -> Output │ 34 │ 9437184 │ +└─────────────────┴────────────────────────────┴───────┴─────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 388 │ +│ 512 │ 1 │ +│ 1024 │ 32 │ +│ 2048 │ 128 │ +│ 4096 │ 153 │ +│ 1048576 │ 128 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 269 │ 118497796 │ +│ Load │ Input -> Internal │ 8 │ 4194304 │ +│ Load │ Internal │ 436 │ 76021760 │ +│ Save │ Internal │ 205 │ 42860544 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 4 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 385 │ +│ 1024 │ 103 │ +│ 2048 │ 54 │ +│ 4096 │ 356 │ +│ 8192 │ 14 │ +│ 1048576 │ 128 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 2 │ 16777216 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 668 │ 269901836 │ +│ Load │ Internal │ 61 │ 31460484 │ +│ Save │ Internal │ 354 │ 31610368 │ +└─────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 4 │ +│ 32 │ 2 │ +│ 128 │ 2 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 193 │ +│ 4096 │ 567 │ +│ 8192 │ 12 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: MM Stats: #MatMults 31611 #MatMult-Transposes 7995 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: IO Tensor size combined: 6781492268 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 16777216 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: report_stats finished after 0.017 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:39:01Z INFO 9099 []: find first defs for local +2025-11-04T21:39:01Z INFO 9099 []: find first defs for global +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: Real CC buffer size 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.056 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: spill space = 941621304 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: aligned spill space = 941735936 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.054 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [BirLinker]: linking Done. +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: bir_linker finished after 0.883 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=439mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running postlnk_dma_report +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to postlnk_dma_report: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DMAReport]: DMA Report: Bytes loaded or saved 718345649, 58.0927% input load, 3.64927% output write, 38.258% spill/reload +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: postlnk_dma_report finished after 0.009 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running report_stats +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to report_stats: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: main +┌─────────────┬──────┬───────┬───────┐ +│ Instruction │ Kind │ Count │ Bytes │ +└─────────────┴──────┴───────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────────┬────────────────────────────┬───────┬─────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼─────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 64 │ 19914555392 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 114944 │ +│ Load │ ExternalInput -> Internal │ 55 │ 24146436 │ +│ Load │ Internal │ 352 │ 41943040 │ +│ Save │ Internal │ 199 │ 37224448 │ +│ Save │ Internal -> Output │ 35 │ 9437186 │ +└─────────────────┴────────────────────────────┴───────┴─────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 1 │ +│ 128 │ 1 │ +│ 256 │ 388 │ +│ 512 │ 1 │ +│ 1024 │ 32 │ +│ 2048 │ 128 │ +│ 4096 │ 153 │ +│ 1048576 │ 128 │ +│ 8388608 │ 4 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0001 +┌─────────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 32 │ 0 │ +│ DMACopy │ Internal -> ExternalOutput │ 128 │ 4294967296 │ +│ DMACopy │ Internal -> Output │ 2 │ 33554432 │ +│ DMACopy (Spill) │ Internal │ 192 │ 0 │ +│ Load │ Const -> Internal │ 5 │ 98304 │ +│ Load │ ExternalInput -> Internal │ 269 │ 118497796 │ +│ Load │ Input -> Internal │ 8 │ 4194304 │ +│ Load │ Internal │ 435 │ 74055680 │ +│ Save │ Internal │ 235 │ 58195968 │ +│ Save │ Internal -> Output │ 33 │ 16777218 │ +└─────────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 5 │ +│ 4 │ 1 │ +│ 32 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 385 │ +│ 1024 │ 101 │ +│ 2048 │ 56 │ +│ 4096 │ 421 │ +│ 8192 │ 10 │ +│ 1048576 │ 128 │ +│ 8388608 │ 7 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: Data Movement Statistics: sg0002 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ Input -> Internal │ 1 │ 25165824 │ +│ DMACopy │ Internal │ 4 │ 16777216 │ +│ Load │ Const -> Internal │ 8 │ 348936 │ +│ Load │ ExternalInput -> Internal │ 669 │ 269905932 │ +│ Load │ Internal │ 75 │ 31779720 │ +│ Save │ Internal │ 370 │ 31625733 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 4 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 1 │ 1 │ +│ 2 │ 1 │ +│ 4 │ 9 │ +│ 8 │ 2 │ +│ 16 │ 3 │ +│ 32 │ 6 │ +│ 64 │ 2 │ +│ 128 │ 4 │ +│ 256 │ 1 │ +│ 384 │ 1 │ +│ 512 │ 302 │ +│ 1024 │ 209 │ +│ 2048 │ 1 │ +│ 4096 │ 568 │ +│ 8192 │ 12 │ +│ 9496 │ 2 │ +│ 8388608 │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: MM Stats: #MatMults 32247 #MatMult-Transposes 8507 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: IO Tensor size combined: 6781492268 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ input60_sg0000 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369_sg0002 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output3 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ output2 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output7 │ ExternalOutput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ output11 │ ExternalOutput │ bfloat16 │ 33554432 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────┬───────────────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────┼───────────────────┼──────────┼──────────────┤ +│ intermediate3 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate0 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate20 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate11 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate5 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate14 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate26 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate23 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate17 │ InternalInterface │ bfloat16 │ 16777216 │ +│ intermediate8 │ InternalInterface │ bfloat16 │ 16777216 │ +└─────────────────┴───────────────────┴──────────┴──────────────┘ + +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: report_stats finished after 0.016 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running coloring_allocator_dram_post_lnk +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Local +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: size = 0 +2025-11-04T21:39:01Z INFO 9099 []: find first defs for local +2025-11-04T21:39:01Z INFO 9099 []: find first defs for global +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: lo = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: total = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: coloring_allocator_dram_post_lnk finished after 0.056 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running coloring_allocator_dram_shared_post_lnk +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to coloring_allocator_dram_shared_post_lnk: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: allocating spills in DRAM post_link mode for address space Shared +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: reserved space = 0 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: spill space = 941621304 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: aligned spill space = 941735936 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: renumber locations +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: size = 86 +2025-11-04T21:39:01Z INFO 9099 []: find first defs for local +2025-11-04T21:39:01Z INFO 9099 []: find first defs for global +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Num intervals 86 Num locations 86 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: initialize low and high +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: lo = 86 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: hi = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: total = 86 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: simplify +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Already used DRAM hwm: 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: select ranges +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: allreduce_dram_hwm 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: Real CC buffer size 114819072 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: DRAM hwm after allocation: 184037376 +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:01Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: coloring_allocator_dram_shared_post_lnk finished after 0.052 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 6, Failed: 0 +2025-11-04T21:39:01Z USER 9099 [BackendPassManager]: nc_parallel_pass finished after 1.031 seconds +2025-11-04T21:39:01Z INFO 9099 [BackendPassManager]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=439mb) +2025-11-04T21:39:01Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:39:01Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=31788 blocks=8 instructions=94292 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:39:01Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=8 allocs=31788 blocks=8 instructions=94292 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.003 seconds +2025-11-04T21:39:01Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 94292 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:39:01Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:39:01Z INFO 9099 [BackendPassManager]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z USER 9099 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:39:01Z INFO 9099 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=8 allocs=31788 blocks=8 instructions=94292 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_post_lnk +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_post_lnk: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.042 seconds +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:39:01Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_post_lnk finished after 0.046 seconds +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:01Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:01Z USER 9099 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.011 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.011 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.026 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47843 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=16132 blocks=4 instructions=47843 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 1 DGE instructions +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 1 DGE instructions were scanned +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.025 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46449 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=4 allocs=15656 blocks=4 instructions=46449 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.007 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47850 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=16132 blocks=4 instructions=47850 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [OptimizeQueueSwitch]: Optimize queue switch has replaced 7 total SQI Instructions with RQI +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.006 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46456 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=4 allocs=15656 blocks=4 instructions=46456 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 8028/8028 (100% DGE) + power-of-2 partition : 8029/8064 (99.566% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 8030/8065 (99.566% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/171 (98.8304% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/171 (98.8304% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 18090/18106 (99.9116% DGE) + power-of-2 partition : 18090/18414 (98.2405% DGE) + > 3 dimensional : 0/16 (0% DGE) + non-integer desc size : 0/0 + total : 18090/18414 (98.2405% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 29 + Transpose : 5376 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 3648/3648 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_dma finished after 0.163 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46456 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=15656 blocks=4 instructions=46456 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: expand_all_engine finished after 0.013 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46456 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=15656 blocks=4 instructions=46456 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 8892/8892 (100% DGE) + power-of-2 partition : 8949/9014 (99.2789% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 8950/9015 (99.279% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 169/171 (98.8304% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 169/171 (98.8304% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 18878/18894 (99.9153% DGE) + power-of-2 partition : 18878/19234 (98.1491% DGE) + > 3 dimensional : 0/16 (0% DGE) + non-integer desc size : 0/0 + total : 18878/19234 (98.1491% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 893 + Transpose : 5376 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 1/1 (100% DGE) + vector : 3648/3648 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_dma finished after 0.184 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47850 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=4 allocs=16132 blocks=4 instructions=47850 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: expand_all_engine finished after 0.014 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47850 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=4 allocs=16132 blocks=4 instructions=47850 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.062 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46456 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=15656 blocks=4 instructions=46456 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.065 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47850 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=4 allocs=16132 blocks=4 instructions=47850 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: expand_inst_late finished after 0.066 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 47043 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=15656 blocks=4 instructions=47043 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [SeqInstOpt]: Removing 320 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [SeqInstOpt]: Removing 257 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.009 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 46466 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=15656 blocks=4 instructions=46466 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: expand_inst_late finished after 0.075 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 48437 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=4 allocs=16132 blocks=4 instructions=48437 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [SeqInstOpt]: Removing 320 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [SeqInstOpt]: Removing 257 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_sync finished after 0.031 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49837 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.009 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=15656 blocks=4 instructions=49837 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 47860 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=4 allocs=16132 blocks=4 instructions=47860 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_act finished after 0.009 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=15656 blocks=4 instructions=49864 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_sync finished after 0.034 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51549 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=4 allocs=16132 blocks=4 instructions=51549 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_act finished after 0.010 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=4 allocs=16132 blocks=4 instructions=51577 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_dve finished after 0.184 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1214mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=15656 blocks=4 instructions=49864 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: lower_ap finished after 0.016 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1214mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=15656 blocks=4 instructions=49864 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_dve finished after 0.190 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1214mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=4 allocs=16132 blocks=4 instructions=51577 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: lower_ap finished after 0.017 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=4 allocs=16132 blocks=4 instructions=51577 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: size = 3 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: lo = 3 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: total = 3 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z USER 9099 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.146 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01) [CoreForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: allocating REG +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: renumber registers +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: size = 4 +2025-11-04T21:39:02Z INFO 9099 []: find first defs for local reg +2025-11-04T21:39:02Z INFO 9099 []: find first defs for global reg +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: live range analysis +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: find costs +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify interference graph +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: initialize low and high +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: lo = 4 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: hi = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: inf = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: total = 4 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: simplify +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: new candidates = 0 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: select ranges +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: no more spills +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:39:02Z USER 9099 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.136 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00) [CoreForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: nc_parallel_pass finished after 0.846 seconds +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: vnc_remote_addr_map finished after 0.005 seconds +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 101441 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: Running vnc_link +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 [VncLink]: Found 0 remote updates +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: vnc_link finished after 0.002 seconds +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 101441 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:39:02Z USER 9099 (nc01/sgLnk) [ModuleForkPass]: Running birverifier +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=16132 blocks=4 instructions=51577 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=4 allocs=15656 blocks=4 instructions=49864 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00/sgLnk) [ModuleForkPass]: birverifier finished after 0.107 seconds +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc01/sgLnk) [ModuleForkPass]: birverifier finished after 0.118 seconds +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.122 seconds +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:39:02Z INFO 9099 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.019 seconds +2025-11-04T21:39:02Z INFO 9099 (sg00) [SubgraphForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z INFO 9099 (sg00) [SubgraphForkPass]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 101441 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: subgraph_parallel_pass finished after 0.024 seconds +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: curr_vmrss: 1215mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:02Z USER 9099 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:39:02Z INFO 9099 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z USER 9099 (nc00/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:39:02Z USER 9099 (nc01/sgLnk) [ModuleForkPass]: Running codegen +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=15656 blocks=4 instructions=49864 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: Inputs to codegen: modules=1 functions=4 allocs=16132 blocks=4 instructions=51577 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:39:02Z INFO 9099 (nc01/sgLnk) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 1.89235 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.000685696 │ +└────────────────┴─────────────┘ + +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:39:02Z INFO 9099 (nc00/sgLnk) [Codegen]: +┌────────────────┬────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼────────────┤ +│ ExternalInput │ 1.89235 │ +│ ExternalOutput │ 1.75 │ +│ Const │ 0.00068761 │ +└────────────────┴────────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 32171 │ +│ LDWEIGHTS │ 31710 │ +│ EVENT_SEMAPHORE │ 3371 │ +│ ACTIVATE │ 3203 │ +│ UNKNOWN(0xd4) │ 2638 │ +│ TENSOR_TENSOR │ 2397 │ +│ CAST │ 1446 │ +│ UNKNOWN(0x9a) │ 1152 │ +│ UNKNOWN(0x9b) │ 1152 │ +│ COPY │ 1134 │ +│ PSEUDO_DMA_TRIGGER │ 722 │ +│ TENSOR_SCALAR_ADDR │ 619 │ +│ UNKNOWN(0xda) │ 333 │ +│ MEMSET │ 311 │ +│ TENSOR_REDUCE │ 269 │ +│ UNKNOWN(0x92) │ 264 │ +│ RECIPROCAL │ 257 │ +│ UNKNOWN(0x24) │ 256 │ +│ TENSOR_SCALAR │ 221 │ +│ UNKNOWN(0xd3) │ 145 │ +│ UNKNOWN(0xd8) │ 125 │ +│ ACT_TABLE_LOAD │ 27 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ STREAM_SHUFFLE │ 16 │ +│ LOAD_MASK_SELECT │ 16 │ +│ UNKNOWN(0xd2) │ 15 │ +│ UNKNOWN(0xd9) │ 12 │ +│ UNKNOWN(0xe8) │ 8 │ +│ UNKNOWN(0xcf) │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ MOVE │ 7 │ +│ IOTA │ 5 │ +│ ALU_OP │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 4512 │ +│ Scalar │ 7523 │ +│ Tensor │ 64470 │ +│ SyncDMA │ 0 │ +│ Vector │ 6489 │ +│ Sync │ 1065 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:39:03Z USER 9099 (nc01/sgLnk) [Codegen]: isa_gen finished after 0.338 seconds +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 596 │ +│ qDVESpillReload0_defId_2 │ 2 │ +│ qPoolSpillReload0_defId_0 │ 589824 │ +│ qPoolSpillReload0_defId_1 │ 589824 │ +│ qPoolSpillReload0_defId_2 │ 7 │ +│ qSPIO0 │ 172106 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 16396 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 1368757 (0.0203961 GB) +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌───────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├───────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2512-0_b3_grp_29_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2512-0_grp_13_sec_0_mhlo_exponential_6_b3_i0_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2512-0_b0_grp_29_s0_tile1_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2751-0_b1_grp_17_s0_tile0_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2751-0_b3_grp_30_s0_tile1_exp_tp_sbuf_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2512-0_grp_29_sec_0_mhlo_exponential_6_b3_i1_sg0001 │ Internal │ bfloat16 │ 16 │ +│ compare.2.1705_sg0001 │ Internal │ int32 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 297 │ +└───────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:39:03Z USER 9099 (nc01/sgLnk) [Codegen]: dma_desc_gen finished after 0.066 seconds +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:39:03Z USER 9099 (nc01/sgLnk) [Codegen]: debug_info_gen finished after 0.076 seconds +2025-11-04T21:39:03Z USER 9099 (nc01/sgLnk) [ModuleForkPass]: codegen finished after 0.505 seconds +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: curr_vmrss: 1252mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 15656 memory location(s), 4 block(s), and 49864 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: Instruction Stats: +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 32931 │ +│ LDWEIGHTS │ 32464 │ +│ EVENT_SEMAPHORE │ 3689 │ +│ ACTIVATE │ 3210 │ +│ UNKNOWN(0xd4) │ 2708 │ +│ TENSOR_TENSOR │ 2399 │ +│ CAST │ 1446 │ +│ COPY │ 1326 │ +│ UNKNOWN(0x9b) │ 1152 │ +│ UNKNOWN(0x9a) │ 1152 │ +│ PSEUDO_DMA_TRIGGER │ 792 │ +│ TENSOR_SCALAR_ADDR │ 619 │ +│ UNKNOWN(0xda) │ 333 │ +│ MEMSET │ 324 │ +│ POOL_BUFFER_LOAD │ 291 │ +│ GATHER │ 291 │ +│ TENSOR_REDUCE │ 274 │ +│ UNKNOWN(0x92) │ 264 │ +│ RECIPROCAL │ 259 │ +│ UNKNOWN(0x24) │ 256 │ +│ TENSOR_SCALAR │ 223 │ +│ UNKNOWN(0xd3) │ 145 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ UNKNOWN(0xd8) │ 125 │ +│ ACT_TABLE_LOAD │ 28 │ +│ STREAM_SHUFFLE │ 20 │ +│ LOAD_MASK_SELECT │ 20 │ +│ PSEUDO_BRANCH_LABEL │ 20 │ +│ UNKNOWN(0xd2) │ 15 │ +│ UNKNOWN(0xd9) │ 12 │ +│ UNKNOWN(0xe8) │ 8 │ +│ MOVE │ 7 │ +│ UNKNOWN(0xcf) │ 7 │ +│ PSEUDO_DMA_REARM │ 7 │ +│ IOTA │ 5 │ +│ ALU_OP │ 2 │ +│ UNKNOWN(0xe5) │ 2 │ +│ PSEUDO_TENSOR_LOAD │ 1 │ +│ RNG │ 1 │ +│ TENSOR_SCALAR │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 5313 │ +│ Scalar │ 7709 │ +│ Tensor │ 65997 │ +│ SyncDMA │ 0 │ +│ Vector │ 7150 │ +│ Sync │ 1192 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:39:03Z USER 9099 (nc00/sgLnk) [Codegen]: isa_gen finished after 0.556 seconds +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────────────┼────────────────┤ +│ qActSpillReload0_defId_2 │ 602 │ +│ qDVESpillReload0_defId_2 │ 140 │ +│ qPoolSpillReload0_defId_0 │ 589824 │ +│ qPoolSpillReload0_defId_1 │ 602112 │ +│ qPoolSpillReload0_defId_2 │ 207 │ +│ qSPIO0 │ 172110 │ +│ qSPPIOParam0 │ 56 │ +│ qSPSpillReload0_defId_0 │ 2 │ +│ qSPSpillReload0_defId_2 │ 16740 │ +└───────────────────────────┴────────────────┘ + +Total descriptors: 1381793 (0.0205903 GB) +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qSPIO0 │ 16 │ +│ qActDynamicHW │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +│ qSPPIOParam0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 144 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: Tensors with largest descriptor count: +┌───────────────────────────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├───────────────────────────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ I-2512-0_b2_grp_31_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2751-0_grp_29_sec_0_mhlo_exponential_6_b1_i1_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2751-0_grp_27_sec_0_mhlo_exponential_6_b3_i0_sg0000 │ Internal │ bfloat16 │ 16 │ +│ I-2512-0_grp_31_sec_0_mhlo_exponential_6_b2_i1_sg0001 │ Internal │ bfloat16 │ 16 │ +│ I-2512-0_b1_grp_16_s0_tile0_exp_tp_sbuf_sg0001 │ Internal │ bfloat16 │ 16 │ +│ compare.2.1705_sg0001 │ Internal │ int32 │ 27 │ +│ all-reduce.465.1930_sg0001 │ Internal │ bfloat16 │ 27 │ +│ add.4_sg0001 │ Internal │ bfloat16 │ 27 │ +│ input2 │ ExternalInput │ int32 │ 28 │ +│ convert.55_sg0002 │ Internal │ float32 │ 298 │ +└───────────────────────────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:39:03Z USER 9099 (nc00/sgLnk) [Codegen]: dma_desc_gen finished after 0.083 seconds +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [Codegen]: Generating debug info +2025-11-04T21:39:03Z USER 9099 (nc00/sgLnk) [Codegen]: debug_info_gen finished after 0.078 seconds +2025-11-04T21:39:03Z USER 9099 (nc00/sgLnk) [ModuleForkPass]: codegen finished after 0.741 seconds +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: curr_vmrss: 990mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [ModuleForkPass]: Output has 1 module(s), 4 function(s), 16132 memory location(s), 4 block(s), and 51577 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z USER 9099 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:39:03Z USER 9099 [BackendPassManager]: mod_parallel_pass finished after 0.747 seconds +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: curr_vmrss: 990mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:03Z USER 9099 [BackendPassManager]: Running hbm_usage +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 1.219KB │ 181.250KB │ +│ CCE │ 2.625MB │ 192.047KB │ +│ Transpose │ 0.000B │ 18.000MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.750KB │ 200.750KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc00/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.840GB │ +│ Model Code │ 5.332MB │ +│ Model Constants │ 721.012KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 175.508MB │ +│ DMA Ring IO │ 2.643MB │ +│ DMA Ring Spill │ 18.561MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 1.156KB │ 169.625KB │ +│ CCE │ 2.625MB │ 48.000B │ +│ Transpose │ 0.000B │ 18.000MB │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 16.250KB │ 177.250KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:39:03Z INFO 9099 (nc01/sgLnk) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.775GB │ +│ Model Code │ 5.131MB │ +│ Model Constants │ 719.004KB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 109.500MB │ +│ DMA Ring IO │ 2.642MB │ +│ DMA Ring Spill │ 18.339MB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:39:03Z INFO 9099 [HBMUsage]: Total estimated HBM usage is: 3.973GB +2025-11-04T21:39:03Z USER 9099 [BackendPassManager]: hbm_usage finished after 0.011 seconds +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: curr_vmrss: 990mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 101441 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z USER 9099 [BackendPassManager]: Running neff_packager +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=8 allocs=31788 blocks=8 instructions=101441 Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1731_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1648-1733_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1659-1735_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1965_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1952_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1469-1584_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1480-1586_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1731_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.24_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.25_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.27-1139-1362_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1553_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.7_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.9-1731_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.3-1648-1733_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_constant.2-1659-1735_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1965_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0000_identity_1952_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.15_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.12-1469-1584_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_constant.11-1480-1586_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0001_identity_1731_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.26_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.28_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_constant.29_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: FileDeDuper file not found value_sg0002_identity_1553_CRSM.npy +2025-11-04T21:39:03Z INFO 9099 [NeffPackager]: Const File de-dup saved 0 KB of memory footprint +2025-11-04T21:39:03Z WARNING 9099 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/metrics.json +2025-11-04T21:39:03Z WARNING 9099 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:39:03Z INFO 9099 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff +2025-11-04T21:39:03Z INFO 9099 [NeffFileWriter]: IR signature: c542dc58eda7d64aa0ce20926682f6ed for neff artifacts +2025-11-04T21:39:03Z USER 9099 [BackendPassManager]: neff_packager finished after 0.171 seconds +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: curr_vmrss: 990mb, ru_maxrss: 1372mb (delta=0mb) +2025-11-04T21:39:03Z INFO 9099 [BackendPassManager]: Output has 2 module(s), 8 function(s), 31788 memory location(s), 8 block(s), and 101441 instruction(s). Max writers: 299 Max Readers: 5818 +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.007324 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.093262 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.007324 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.085938 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local │ 0.013184 GB │ +│ nc00 │ sg01 │ Peak scratchpad usage: local and shared │ 0.106934 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: local │ 0.015137 GB │ +│ nc00 │ sg01 │ Total size of allocated tensors: shared │ 0.109375 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local │ 0.013672 GB │ +│ nc00 │ sg02 │ Peak scratchpad usage: local and shared │ 0.060856 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: local │ 0.013699 GB │ +│ nc00 │ sg02 │ Total size of allocated tensors: shared │ 0.062836 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.013672 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.106934 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.171398 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.877060 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.007324 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.007324 GB │ +│ nc01 │ sg01 │ Peak scratchpad usage: local │ 0.013672 GB │ +│ nc01 │ sg01 │ Total size of allocated tensors: local │ 0.016479 GB │ +│ nc01 │ sg02 │ Peak scratchpad usage: local │ 0.013672 GB │ +│ nc01 │ sg02 │ Total size of allocated tensors: local │ 0.013672 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.013672 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.171398 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1931 │ bfloat16 │ 13 │ 6.500000 MB │ +│ transpose.9_pftranspose_1111 │ bfloat16 │ 2 │ 1.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬──────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼──────────────┤ +│ dot.4 │ bfloat16 │ 2 │ 16.000000 MB │ +│ get_tuple_element.1 │ bfloat16 │ 1 │ 8.000000 MB │ +│ reshape.16 │ bfloat16 │ 1 │ 8.000000 MB │ +│ reshape.24 │ bfloat16 │ 1 │ 8.000000 MB │ +│ reshape.29 │ bfloat16 │ 1 │ 8.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴──────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc00 (complete data located at nc00//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬──────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼──────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴──────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ _spill_1931 │ bfloat16 │ 13 │ 6.500000 MB │ +│ transpose.9_pftranspose_1111 │ bfloat16 │ 2 │ 1.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Largest intermediate tensors at peak scratchpad usage, core=nc01 (complete data located at nc01//sgLnk/sg00/memory_analysis_after_coloring_allocator_dram_post_lnk_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬──────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼──────────────┤ +│ intermediate0 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate3 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate5 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate6 │ bfloat16 │ 1 │ 16.000000 MB │ +│ intermediate1 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate2 │ bfloat16 │ 1 │ 1.000000 MB │ +│ intermediate4 │ bfloat16 │ 1 │ 0.003906 MB │ +│ intermediate7 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴──────────────┘ + +2025-11-04T21:39:03Z INFO 9099 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:39:04Z INFO 8756 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:39:04Z INFO 8756 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "walrus_bir.out.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "cached_wavegraph": "walrus_bir.out.json", "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:39:04Z INFO 8756 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l +2025-11-04T21:39:04Z INFO 8756 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:39:04Z INFO 8756 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/context_encoding_model/_tp0_bk5/neuronxcc-uoudd35l/hlo_netlist.json +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:39:04Z INFO 8756 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:39:04Z INFO 8756 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:39:04Z INFO 8743 [root]: Subcommand returned with exitcode=0 diff --git a/context_encoding_model/_tp0_bk5/metaneff.pb b/context_encoding_model/_tp0_bk5/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..7f1cc3c27df3dd836b5022326be32163b236d691 --- /dev/null +++ b/context_encoding_model/_tp0_bk5/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c450bc5d4e02b721362ee602b13588691279903456d446e0b1d4f048c89716a5 +size 5020653 diff --git a/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb b/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..ea52206082d734490a50b178b72d170fa03d5d4e --- /dev/null +++ b/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa2ec0d3433966689dceea7b24e7287ac4c7bb36edc7707dd0d953638b51f72 +size 5107439 diff --git a/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff b/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff new file mode 100644 index 0000000000000000000000000000000000000000..829fed0d85d00a9196fb2d504e59a821afc81b1a --- /dev/null +++ b/context_encoding_model/_tp0_bk5/model.MODULE_96a8f4e12dc810958634+b1e26cef.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56e28f3613a7ada8c1d580c4a0d3979da6436bd82072a724c52018668343c286 +size 3062784 diff --git a/context_encoding_model/_tp0_bk5/neuron_config.json b/context_encoding_model/_tp0_bk5/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8b257e62415bbe6fe24c74c56633679b7f3b160b --- /dev/null +++ b/context_encoding_model/_tp0_bk5/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 1, + "bucket_n_active_tokens": true, + "buckets": [ + 4096 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": [ + 4096 + ], + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": true, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/layout_opt/command.txt b/layout_opt/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..31e96bc6a6c9f0c96a90bfe63e2fb94dab798aad --- /dev/null +++ b/layout_opt/command.txt @@ -0,0 +1 @@ +neuronx-cc compile graph.hlo --framework XLA --target trn2 --output graph.neff --model-type=transformer -O1 --lnc=2 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=true' --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/layout_opt/graph.neff b/layout_opt/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..3dac5314d392be1c66ca574d08bd32ed1ac2a62c --- /dev/null +++ b/layout_opt/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd8dc76df642bc7586f583a6602017fdebbaf6b3a6fdb6b43ab5164643c0af63 +size 1649664 diff --git a/layout_opt/log-neuron-cc.txt b/layout_opt/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1a956f5a8a6b788b5b30a5efa145a90dd28c806 --- /dev/null +++ b/layout_opt/log-neuron-cc.txt @@ -0,0 +1,3306 @@ +2025-11-04T21:41:35Z INFO 9914 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/model/graph.hlo --framework XLA --target trn2 --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/graph.neff --model-type=transformer -O1 --lnc=2 '--internal-hlo2tensorizer-options=--experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=true' --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/log-neuron-cc.txt --verbose=35 +2025-11-04T21:41:35Z INFO 9914 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:41:35Z INFO 9926 [root]: XLA detected +2025-11-04T21:41:35Z INFO 9926 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:41:36Z INFO 9926 [root]: Intermediate files stored in /home/ubuntu/neuronxcc-799a9j1_, output in /home/ubuntu +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/model/graph.hlo --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --partition --emit-tensor-level-dropout-ops --experimental-unsafe-fp8e4m3fn-as-fp8e4m3 --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: parameter reshape transpose tuple +2025-11-04 21:41:36.121096: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:41:36.121294: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %last = tuple(%p60, %reshape.619, %reshape.621, %reshape.623, %reshape.625, %reshape.627, %reshape.629, %reshape.631, %reshape.633, %reshape.635, %reshape.637, %reshape.639, %reshape.641, %reshape.643, %reshape.645, %reshape.647, %reshape.649, %reshape.651, %reshape.653, %reshape.655, %reshape.657, %reshape.659, %reshape.661, %reshape.663, %reshape.665, %reshape.667, %reshape.669, %reshape.671, %reshape.673, %reshape.675, %reshape.677, %reshape.679, %reshape.681, %reshape.683, %reshape.685, %reshape.687,... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: IR signature: 2cdee81f1f2958c7a4eb33803421cc426699c5f9c5f2a8b512dc81d0bb27b479 for sg0000/HLOToTensorizer +2025-11-04T21:41:36Z INFO 9926 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:41:36Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:41:36Z INFO 9926 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:41:36Z INFO 9926 [job.Frontend.0]: Processing input #0 +2025-11-04T21:41:36Z INFO 9926 [job.Frontend.0]: Start model loading +2025-11-04T21:41:36Z INFO 9926 [job.Frontend.0]: Start tensorization +2025-11-04T21:41:36Z INFO 9926 [job.Frontend.0]: Num jobs: 12 +2025-11-04T21:41:36Z USER 9926 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:41:36Z INFO 9926 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:41:36Z INFO 9926 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:41:36Z INFO 9926 [Tensorizer]: Tensorizer options: --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=matmult-bf16 --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.004 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.009 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.028 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.016 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.005 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.061 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.073 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.005 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.039 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.006 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.004 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.010 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.010 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.003 seconds +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:41:36Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.060 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.028 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.089 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.004 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.000 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.005 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.020 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Rematerialization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.003 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.000 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [Tensorizer]: After optimization: 309 statements +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AutoCastTCInputs]: Running AutoCastTCInputs +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AutoCastTCInputs]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AutoCastTCInputs]: AutoCastTCInputs finished after 0.003 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.003 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.006 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.004 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.006 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.019 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.004 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.036 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 0.016 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 0.047 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.004 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 0.067 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.009 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.014 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 309 +total number of sharded dags: 0 + +total bytes transferred from input, output, non local tensors: 0 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 0 +% bytes transferred with 2x bandwidths: 0.00 + +NC0 FLOPs: 0 +NC1 FLOPs: 0 +% FLOPs sharded: 0.00 + + + +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.010 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 0.066 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 0.051 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.003 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.005 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.003 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 0.017 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 0.119 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.002 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.001 seconds +2025-11-04T21:41:37Z INFO 9926 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 27.340 seconds +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 27.756 seconds +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.188 seconds +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.177 seconds +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:42:05Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 0.637 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.025 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 0.663 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.118 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.119 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.032 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.022 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.082 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.070 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 0.271 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: transpose_128x128 +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.041 seconds +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:42:06Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.129 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.129 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.077 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.068 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.134 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.134 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.022 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.090 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.021 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.007 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.058 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=False) +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.075 seconds +2025-11-04T21:42:07Z INFO 9926 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 0.377 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.135 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.136 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.024 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.078 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LICM]: LICM finished after 0.034 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 0.394 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.001 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.035 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.046 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.028 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.022 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.016 seconds +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:42:08Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.127 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.047 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 0.174 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.019 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.068 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.115 seconds +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:42:09Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.682 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.032 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.717 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.035 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.022 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.047 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.056 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.024 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.024 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.012 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.011 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.001 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.001 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.025 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.009 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.010 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.011 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.089 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.090 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.006 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.161 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.161 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.043 seconds +2025-11-04T21:42:11Z INFO 9926 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.500 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.018 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.041 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.043 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.015 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.168 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.018 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.015 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.062 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.058 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.169 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.169 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.016 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.140 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.028 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.018 seconds +2025-11-04T21:42:12Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.017 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.029 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.010 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.064 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.015 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.040 seconds +2025-11-04T21:42:13Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:42:14Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 1.224 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.104 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 2.414 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.071 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.018 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.017 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.079 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30332.36717'[i0_5574_0_32416,i0_5574_1_32416,T_i1_39174,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %2838[i0_5574_0_32416,i0_5574_1_32416,T_i1_39174,i0.128,i3.2,i2.16,i1.128] # id=34997, src_id=None, , instances=16 # dl = tensor_op_name: t2838_pftranspose_30332 | hlo_id: 1939 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30387.36727'[i0_6479_0_32431,i0_6479_1_32431,T_i1_39175,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %2915[i0_6479_0_32431,i0_6479_1_32431,T_i1_39175,i0.128,i3.2,i2.16,i1.128] # id=35059, src_id=None, , instances=16 # dl = tensor_op_name: t2915_pftranspose_30387 | hlo_id: 1972 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30442.36737'[i0_7384_0_32446,i0_7384_1_32446,T_i1_39176,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %2992[i0_7384_0_32446,i0_7384_1_32446,T_i1_39176,i0.128,i3.2,i2.16,i1.128] # id=35121, src_id=None, , instances=16 # dl = tensor_op_name: t2992_pftranspose_30442 | hlo_id: 2005 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30497.36747'[i0_8289_0_32461,i0_8289_1_32461,T_i1_39177,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3069[i0_8289_0_32461,i0_8289_1_32461,T_i1_39177,i0.128,i3.2,i2.16,i1.128] # id=35183, src_id=None, , instances=16 # dl = tensor_op_name: t3069_pftranspose_30497 | hlo_id: 2038 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30552.36757'[i0_9194_0_32476,i0_9194_1_32476,T_i1_39178,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3146[i0_9194_0_32476,i0_9194_1_32476,T_i1_39178,i0.128,i3.2,i2.16,i1.128] # id=35245, src_id=None, , instances=16 # dl = tensor_op_name: t3146_pftranspose_30552 | hlo_id: 2071 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30607.36767'[i0_10099_0_32491,i0_10099_1_32491,T_i1_39179,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3223[i0_10099_0_32491,i0_10099_1_32491,T_i1_39179,i0.128,i3.2,i2.16,i1.128] # id=35307, src_id=None, , instances=16 # dl = tensor_op_name: t3223_pftranspose_30607 | hlo_id: 2104 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30662.36777'[i0_11004_0_32506,i0_11004_1_32506,T_i1_39180,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3300[i0_11004_0_32506,i0_11004_1_32506,T_i1_39180,i0.128,i3.2,i2.16,i1.128] # id=35369, src_id=None, , instances=16 # dl = tensor_op_name: t3300_pftranspose_30662 | hlo_id: 2137 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30717.36787'[i0_11909_0_32521,i0_11909_1_32521,T_i1_39181,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3377[i0_11909_0_32521,i0_11909_1_32521,T_i1_39181,i0.128,i3.2,i2.16,i1.128] # id=35431, src_id=None, , instances=16 # dl = tensor_op_name: t3377_pftranspose_30717 | hlo_id: 2170 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30772.36797'[i0_12814_0_32536,i0_12814_1_32536,T_i1_39182,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3454[i0_12814_0_32536,i0_12814_1_32536,T_i1_39182,i0.128,i3.2,i2.16,i1.128] # id=35493, src_id=None, , instances=16 # dl = tensor_op_name: t3454_pftranspose_30772 | hlo_id: 2203 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 74.128us (16.000MiB, est bw: 226.329GB/s, 0.643% of tot. time) for bfloat16<128 x 4096> TongaSB partitions[3] bfloat16 (2, 4, 2, 128, 2, 16, 128) %'30827.36807'[i0_13719_0_32551,i0_13719_1_32551,T_i1_39183,i0.128,i3.2,i2.16,i1.128] = load bfloat16<128 x 4096> non_local bfloat16 (2, 4, 2, 128, 2, 12, 128) %3531[i0_13719_0_32551,i0_13719_1_32551,T_i1_39183,i0.128,i3.2,i2.16,i1.128] # id=35555, src_id=None, , instances=16 # dl = tensor_op_name: t3531_pftranspose_30827 | hlo_id: 2236 | if -i2.16+11 >= 0 [[i0.128];[i1.128, i2.16, i3.2]] -> [[i0.128];[i1.128, i2.16, i3.2]] +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.029 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=False) +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 0.017 seconds +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:42:15Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.297 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.298 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:42:16Z WARNING 9926 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 100.00 percent of all matmul computation +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.153 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.262 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.043 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.059 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.068 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.200 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.200 seconds +2025-11-04T21:42:16Z INFO 9926 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.038 seconds +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=False) +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.022 seconds +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.451 seconds +2025-11-04T21:42:17Z INFO 9926 [Tensorizer]: BirCodeGen estimate #instances=29631 in sg0000 +2025-11-04T21:42:17Z INFO 9926 [Tensorizer]: IR signature: 966da3580f3c15f8dba253a6a7bd2b32973dc615f73a022b144fc986488d8019 for nc00/sg0000/TensorizerBIR +2025-11-04T21:42:17Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:42:18Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:42:18Z INFO 9926 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 0.426 seconds +2025-11-04T21:42:18Z INFO 9926 [Tensorizer]: BirCodeGen estimate #instances=29631 in sg0000 +2025-11-04T21:42:18Z INFO 9926 [Tensorizer]: IR signature: 318e912cf7ba246c5ceca1774b19b1d754a906bffc531bcfd54116d80a508e39 for nc01/sg0000/TensorizerBIR +2025-11-04T21:42:18Z INFO 9926 [Tensorizer]: Weights total number of bytes: 32768 +2025-11-04T21:42:18Z INFO 9926 [Tensorizer]: Successfully built model. +2025-11-04T21:42:18Z USER 9926 [root/Tensorizer/Tensorizer]: Tensorizer finished after 42.223 seconds +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: End tensorization +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input0 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input1 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input2 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input3 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input4 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input5 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input6 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input7 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input8 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input9 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input10 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input11 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input12 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input13 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input14 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input15 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input16 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input17 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input18 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input19 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input20 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input21 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input22 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input23 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input24 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input25 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input26 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input27 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input28 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input29 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input30 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input31 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input32 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input33 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input34 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input35 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input36 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input37 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input38 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input39 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input40 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input41 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input42 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input43 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input44 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input45 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input46 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input47 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input48 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input49 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input50 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input51 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input52 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input53 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input54 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input55 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input56 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input57 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input58 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input59 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input60 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input61 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input62 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input63 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input64 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input65 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input66 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input67 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input68 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input69 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input70 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input71 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input72 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input73 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input74 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input75 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input76 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input77 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input78 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input79 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input80 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input81 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input82 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input83 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input84 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input85 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input86 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input87 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input88 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input89 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input90 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input91 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input92 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input93 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input94 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input95 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input96 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input97 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input98 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input99 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input100 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input101 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input102 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input103 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input104 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input105 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input106 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input107 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input108 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input109 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input110 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input111 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input112 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input113 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input114 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input115 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input116 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input117 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input118 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input119 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input120 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input121 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input122 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input123 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input124 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input125 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input126 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input127 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input128 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input129 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input130 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input131 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input132 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input133 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input134 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input135 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input136 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input137 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input138 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input139 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input140 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input141 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input142 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input143 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input144 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input145 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input146 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input147 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input148 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input149 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input150 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input151 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input152 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input153 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input154 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input155 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input156 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input157 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input158 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input159 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input160 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input161 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input162 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input163 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input164 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input165 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input166 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input167 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input168 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input169 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input170 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input171 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input172 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input173 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input174 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input175 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input176 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input177 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input178 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input179 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input180 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input181 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input182 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input183 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input184 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input185 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input186 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input187 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input188 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input189 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input190 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input191 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input192 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input193 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input194 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input195 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input196 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input197 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input198 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input199 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input200 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input201 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input202 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input203 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input204 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input205 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input206 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input207 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input208 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input209 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input210 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input211 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input212 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input213 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input214 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input215 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input216 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input217 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input218 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input219 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input220 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input221 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input222 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input223 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input224 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input225 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input226 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input227 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input228 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input229 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input230 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input231 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input232 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input233 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input234 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input235 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input236 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input237 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input238 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input239 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input240 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input241 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input242 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input243 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input244 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input245 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input246 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input247 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input248 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input249 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input250 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input251 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input252 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input253 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input254 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input255 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input256 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input257 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input258 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input259 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input260 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input261 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input262 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input263 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input264 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input265 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input266 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input267 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input268 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input269 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input270 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input271 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input272 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input273 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input274 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input275 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input276 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input277 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input278 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input279 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input280 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input281 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input282 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input283 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input284 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input285 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input286 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input287 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input288 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input289 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input290 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input291 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input292 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input293 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input294 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input295 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input296 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input297 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input298 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input299 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input300 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input301 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input302 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input303 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input304 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input305 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input306 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input307 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input308 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input309 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Network input: input310 +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: wrote bir.json +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: wrote bir.json +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:42:18Z INFO 9926 [job.Frontend.0]: Job #0 finished +2025-11-04T21:42:18Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:42:18Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:42:18Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:42:18Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/neuronxcc-799a9j1_ +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels spill_reload,scalar_dynamic_offset,io,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/graph.neff +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: Working directory is /home/ubuntu/neuronxcc-799a9j1_ +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:42:18Z INFO 9926 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:42:18Z INFO 10021 [Logging]: Logging to ../qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/log-neuron-cc.txt at level 'INFO' +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/neuronxcc-799a9j1_" +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:42:18Z INFO 10021 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:42:18Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:18Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=3046 blocks=2 instructions=1916 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:42:18Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:42:18Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z USER 10021 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.000 seconds +2025-11-04T21:42:18Z USER 10021 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:42:18Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 134mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:18Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 134mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:18Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:18Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:18Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:18Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.063 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 248mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.063 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 248mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.067 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 248mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=3046 blocks=2 instructions=1916 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=3046 blocks=2 instructions=1916 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.001 seconds +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 249mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 3046 memory location(s), 2 block(s), and 1916 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.002 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 249mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=3046 blocks=2 instructions=1916 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 249mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 249mb, ru_maxrss: 400mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 1523 memory location(s), 1 block(s), and 958 instruction(s). Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=1523 blocks=1 instructions=958 Max writers: 2 Max Readers: 253 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:42:19 2025 + +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:42:19 2025 + +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: Total count: 26634 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: Matmult: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: GenericCopy: 3025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: Load: 1010 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: Save: 841 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: CoreBarrier: 169 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: DMACopy: 28 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 0 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.366 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 585mb, ru_maxrss: 585mb (delta=185mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: Total count: 26521 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: Matmult: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: GenericCopy: 3025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: Load: 1010 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: Save: 728 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: CoreBarrier: 169 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: DMACopy: 28 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 0 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.366 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 582mb, ru_maxrss: 585mb (delta=185mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11161 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11161 memory location(s), 1 block(s), and 26521 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=11161 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=11161 blocks=1 instructions=26521 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.053 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 419mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.054 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 419mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.433 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 419mb, ru_maxrss: 585mb (delta=185mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 419mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10659 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 419mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.044 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.045 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.046 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10659 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:19Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10659 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.005 seconds +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.005 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.002 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 428mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.016 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.017 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z WARNING 10021 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z WARNING 10021 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 430mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 224 nodes. Total savings 229376 bytes/partition +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 224 nodes. Total savings 229376 bytes/partition +2025-11-04T21:42:19Z INFO 10021 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:42:19Z INFO 10021 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.008 seconds +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.019 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 431mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:42:19Z INFO 10021 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.004 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.009 seconds +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.020 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 431mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.084 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 431mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.085 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 431mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.005 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 431mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.005 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 432mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.012 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 432mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.012 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 432mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.023 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.024 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 433mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Found 0 Splits CCs +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: Grouped CCs to 0 clusters. +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Found 0 Splits CCs +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: Grouped CCs to 0 clusters. +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: To Spill 0 multi-layer tensors +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:42:19Z INFO 10021 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Num_Splits: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Num_Splits: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [build_flow_deps]: Allocs: 5188 instructions: 26238 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [build_flow_deps]: Allocs: 5471 instructions: 26634 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 68712 edges +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [build_flow_deps]: Done build fdeps 68712 Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 69391 edges +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [build_flow_deps]: Done build fdeps 69391 Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.173 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:42:19 2025 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.174 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.052 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5188 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=5188 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5189 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=5189 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.052 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5471 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=5471 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5472 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=5472 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.000 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.003 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 452mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: size = 2912 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: size = 2969 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: found 9744 edges +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: mean: 6.69231 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: median: 7 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 77952 bytes +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: lo = 2912 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: total = 2912 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: found 9744 edges +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: mean: 6.56383 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: median: 7 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 77952 bytes +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.051 seconds +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 453mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: lo = 2969 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: hi = 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: total = 2969 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:42:19Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.051 seconds +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 453mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:19Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:42:19Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.032 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 453mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.034 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 453mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.121 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 454mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.114 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 454mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 704675840 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 6137 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 704643072 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 7561 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 704923648 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 6088 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 704890880 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 7011 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: size = 1627 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: size = 1853 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: found 2912 accumulation groups +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: largest = 31831.36698_i46 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: found 2969 accumulation groups +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: largest = 31831.36698_i22 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:42:20Z INFO 10021 []: find first defs for local +2025-11-04T21:42:20Z INFO 10021 []: find first defs for local +2025-11-04T21:42:20Z INFO 10021 []: find first defs for global +2025-11-04T21:42:20Z INFO 10021 []: find first defs for global +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: 672 remat count +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: 672 remat count +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Num intervals 1627 Num locations 1627 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Num intervals 1853 Num locations 1853 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: edge: 7764 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: mean: 8.37992 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: median: 4.65964 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: edge: 7029 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: mean: 8.64044 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: median: 5.00561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: safe = 1850 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: unsafe = 1 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: total = 1851 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1853 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: safe = 1624 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: unsafe = 1 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: inf = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: total = 1625 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 1627 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Total: 1625 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (1625) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Rover zone: 0.948 (1540) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.052 (85) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Blocks tall: 1.000 (1625) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 1.000 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Total: 1851 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (1851) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Rover zone: 0.924 (1710) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.076 (141) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.030 (56) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.000 (0) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.970 (1795) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.923 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 1.000 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 704675840 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 6137 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 704643072 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 7561 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 704923648 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 6088 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 704890880 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 7011 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 0 bytes +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.129 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.129 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 465mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.034 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.034 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 466mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1409318912, 37.5015% input load, 49.9988% output write, 12.4997% spill/reload [sg0000] +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1409814528, 37.5058% input load, 49.9988% output write, 12.4953% spill/reload [sg0000] +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(5.28515e+08) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(5.28763e+08) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 6137 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 7561 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 6088 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 704675840 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 6137 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 7011 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 704643072 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 7561 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 704923648 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 6088 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 704890880 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 7011 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1409318912, 37.5015% input load, 49.9988% output write, 12.4997% spill/reload [sg0000] +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 704675840 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 6137 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 704643072 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 7561 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1409814528, 37.5058% input load, 49.9988% output write, 12.4953% spill/reload [sg0000] +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 6774 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 704923648 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 6088 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.255 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 469mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 704890880 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 7011 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 6516 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.260 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 469mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 557 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 662 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 510 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.301 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 471mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.296 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 471mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:42:20Z INFO 10021 []: find first defs for local +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: spill space = 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 0 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: size = 0 +2025-11-04T21:42:20Z INFO 10021 []: find first defs for local +2025-11-04T21:42:20Z INFO 10021 []: find first defs for global +2025-11-04T21:42:20Z INFO 10021 []: find first defs for global +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.046 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: Num intervals 0 Num locations 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: lo = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: total = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.049 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 472mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.028 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.029 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 1344 out of 2912 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.009 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 1401 out of 3025 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.009 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.009 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 26238, number of allocs: 5190 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.004165 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.003 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.009 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 26634, number of allocs: 5473 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.004687 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.003 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.002 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.002 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.003 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.016 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.016 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.004 seconds +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:20Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.005 seconds +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:20Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:20Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.038 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.036 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.005 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.004 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 473mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [build_flow_deps]: Allocs: 5190 instructions: 26238 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [build_flow_deps]: Allocs: 5473 instructions: 26634 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 69391 edges +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 68712 edges +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [build_flow_deps]: Done build fdeps 69391 Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [build_flow_deps]: Done build fdeps 68712 Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.068 seconds +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.070 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.011 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.011 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 483mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.089 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 516mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.091 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 516mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.040 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.042 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.017 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.018 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 1.742 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.001 seconds +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.011 seconds +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.046 seconds +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.062 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: spill space = 352321536 bytes +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 352321536 bytes +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.020 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: reserved space = 32768 bytes +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: spill space = 352321536 bytes +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 352321536 bytes +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: size = 28 +2025-11-04T21:42:21Z INFO 10021 []: find first defs for local +2025-11-04T21:42:21Z INFO 10021 []: find first defs for global +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: Num intervals 28 Num locations 28 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: lo = 28 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: total = 28 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 12582912 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.050 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.051 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.001 seconds +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.014 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.017 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.018 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:42:21Z USER 10021 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:42:21Z INFO 10021 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.064 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.064 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: nc_parallel_pass finished after 0.066 seconds +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:21Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.000 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 517mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z INFO 10021 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:42:21Z INFO 10021 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:42:21Z INFO 10021 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Time-aware simulation time: 5235313 +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Time-aware simulation time: 5304261 +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: post_sched finished after 0.512 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 574mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z INFO 10021 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:42:21 2025 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: post_sched finished after 0.539 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.024 seconds +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.003 seconds +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:21Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:42:21Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.024 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.570 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:42:22Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.002 seconds +2025-11-04T21:42:22Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.003 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 520mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 246 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1004 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 0 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 990 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 1466 PSUM Banks +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 194 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 271 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 468 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: moved 0 MM forward +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 26 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: moved 0 MM forward +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.461 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.519 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.076 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 527mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.025 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 534mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:42:22 2025 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [build_flow_deps]: Allocs: 5190 instructions: 26238 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.102 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 543mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 67200 edges +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [build_flow_deps]: Done build fdeps 67200 Tue Nov 4 21:42:22 2025 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.022 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 523mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:42:22 2025 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [build_flow_deps]: Allocs: 5473 instructions: 26634 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.101 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 28 │ 352321536 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 672 │ 528482304 │ +│ Load │ Internal │ 224 │ 176160768 │ +│ Save │ Internal -> ExternalOutput │ 728 │ 704643072 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 256 │ 1 │ +│ 2048 │ 224 │ +│ 6144 │ 448 │ +│ 8192 │ 952 │ +│ 6291456 │ 56 │ +└─────────────────────┴───────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 21504 #MatMult-Transposes 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: IO Tensor size combined: 4063727616 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output0 │ ExternalOutput │ bfloat16 │ 311164928 │ +│ input0 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output309 │ ExternalOutput │ bfloat16 │ 311164928 │ +│ input309 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input22 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input30 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input20 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input11 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input33 │ ExternalInput │ bfloat16 │ 12582912 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ _transpose.349-t3069 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.327-t2915 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.360-t3146 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.393-t3377 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.371-t3223 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.404-t3454 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.415-t3531 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.382-t3300 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.338-t2992 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.316-t2838 │ Internal │ bfloat16 │ 12582912 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.009 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 68433 edges +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [build_flow_deps]: Done build fdeps 68433 Tue Nov 4 21:42:22 2025 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.082 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌─────────────┬────────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├─────────────┼────────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 28 │ 352321536 │ +│ Load │ Const -> Internal │ 1 │ 32768 │ +│ Load │ ExternalInput -> Internal │ 785 │ 528730112 │ +│ Load │ Internal │ 224 │ 176160768 │ +│ Save │ Internal -> ExternalOutput │ 841 │ 704890880 │ +└─────────────┴────────────────────────────┴───────┴───────────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 32 │ 57 │ +│ 256 │ 170 │ +│ 2048 │ 224 │ +│ 6144 │ 448 │ +│ 8192 │ 952 │ +│ 6291456 │ 56 │ +└─────────────────────┴───────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 21561 #MatMult-Transposes 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: IO Tensor size combined: 4063727616 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬────────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼────────────────┼──────────┼──────────────┤ +│ output0 │ ExternalOutput │ bfloat16 │ 311164928 │ +│ input0 │ ExternalInput │ bfloat16 │ 311164928 │ +│ output309 │ ExternalOutput │ bfloat16 │ 311164928 │ +│ input309 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input22 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input30 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input20 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input11 │ ExternalInput │ bfloat16 │ 12582912 │ +│ input33 │ ExternalInput │ bfloat16 │ 12582912 │ +└────────────────────┴────────────────┴──────────┴──────────────┘ + +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌──────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├──────────────────────┼──────────┼──────────┼──────────────┤ +│ _transpose.349-t3069 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.327-t2915 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.360-t3146 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.393-t3377 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.371-t3223 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.404-t3454 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.415-t3531 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.382-t3300 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.338-t2992 │ Internal │ bfloat16 │ 12582912 │ +│ _transpose.316-t2838 │ Internal │ bfloat16 │ 12582912 │ +└──────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.007 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.738 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 524mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 0 DMA instructions. Moved 0 DMA instructions to CC's engines. +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 0 DMA instructions. Moved 0 DMA instructions to CC's engines. +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: assign_trigger_engine finished after 0.018 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 525mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.005 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 525mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.007 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 525mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: assign_hwdge_engine finished after 0.006 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 525mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 52872 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌──────────────┬────────────────┬────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├──────────────┼────────────────┼────────┼────────────┼──────────────────┤ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 1428 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 225 │ +└──────────────┴────────────────┴────────┴────────────┴──────────────────┘ + +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.004 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌──────────────┬────────────────┬────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├──────────────┼────────────────┼────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 56 │ +│ qPoolIO0 │ input │ Pool │ 16 │ 56 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 225 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 1542 │ +└──────────────┴────────────────┴────────┴────────────┴──────────────────┘ + +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.004 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.005 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.005 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.010 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:42:22Z USER 10021 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:42:22Z INFO 10021 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:42:22Z USER 10021 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.000 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: nc_parallel_pass finished after 0.001 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.000 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:42:22Z USER 10021 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.022 seconds +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.024 seconds +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.026 seconds +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: curr_vmrss: 526mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:22Z USER 10021 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:42:22Z INFO 10021 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=52872 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z USER 10021 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:42:22Z USER 10021 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:42:22Z INFO 10021 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:42:22Z INFO 10021 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 46747 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 45600 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 48980 +2025-11-04T21:42:22Z INFO 10021 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 48980 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 47283 +2025-11-04T21:42:22Z INFO 10021 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 47283 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [DepReduction]: Finished dependency reduction: 114104 removed, new total 7417 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: dep_reduction finished after 0.178 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 536mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [DepReduction]: Finished dependency reduction: 119046 removed, new total 7693 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: dep_reduction finished after 0.187 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.006 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.009 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.003 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26238 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=5190 blocks=1 instructions=26238 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.011 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.003 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26634 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=5473 blocks=1 instructions=26634 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1400/1400 (100% DGE) + power-of-2 partition : 1428/1428 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1428/1428 (100% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + Spill/Reload + Copy (DGE/DMA) + 128 partition : 225/225 (100% DGE) + power-of-2 partition : 225/225 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 225/225 (100% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 0 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 0/0 + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_dma finished after 0.015 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26247 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=5190 blocks=1 instructions=26247 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: expand_all_engine finished after 0.005 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 529mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26247 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=5190 blocks=1 instructions=26247 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1457/1457 (100% DGE) + power-of-2 partition : 1542/1654 (93.2285% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1542/1654 (93.2285% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + Spill/Reload + Copy (DGE/DMA) + 128 partition : 225/225 (100% DGE) + power-of-2 partition : 225/225 (100% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 225/225 (100% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 0 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 0/0 + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_dma finished after 0.016 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26643 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=5473 blocks=1 instructions=26643 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: expand_all_engine finished after 0.004 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26643 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=5473 blocks=1 instructions=26643 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.025 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26247 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=5190 blocks=1 instructions=26247 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.024 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26643 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=5473 blocks=1 instructions=26643 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: expand_inst_late finished after 0.024 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26247 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=5190 blocks=1 instructions=26247 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.003 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 26247 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=5190 blocks=1 instructions=26247 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: expand_inst_late finished after 0.024 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26643 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=5473 blocks=1 instructions=26643 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [SeqInstOpt]: Removing 0 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.003 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 26643 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=5473 blocks=1 instructions=26643 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_sync finished after 0.012 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28361 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=5190 blocks=1 instructions=28361 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_act finished after 0.004 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=5190 blocks=1 instructions=28362 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_sync finished after 0.012 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28711 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=5473 blocks=1 instructions=28711 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_act finished after 0.004 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 530mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=5473 blocks=1 instructions=28712 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_dve finished after 0.082 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 553mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=5190 blocks=1 instructions=28362 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: lower_ap finished after 0.005 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 543mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=5190 blocks=1 instructions=28362 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_dve finished after 0.080 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 543mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=5473 blocks=1 instructions=28712 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: lower_ap finished after 0.005 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 539mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=5473 blocks=1 instructions=28712 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:42:23Z USER 10021 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.013 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: curr_vmrss: 539mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:42:23Z USER 10021 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.012 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: curr_vmrss: 537mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: nc_parallel_pass finished after 0.401 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 535mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: vnc_remote_addr_map finished after 0.001 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 533mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 57074 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running vnc_link +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 [VncLink]: Found 0 remote updates +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: vnc_link finished after 0.001 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 533mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 57074 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5473 blocks=1 instructions=28712 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=5190 blocks=1 instructions=28362 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.042 seconds +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.043 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 536mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 536mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.044 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 533mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:42:23Z INFO 10021 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.007 seconds +2025-11-04T21:42:23Z INFO 10021 (sg00) [SubgraphForkPass]: curr_vmrss: 533mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 57074 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 533mb, ru_maxrss: 585mb (delta=0mb) +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=5473 blocks=1 instructions=28712 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=5190 blocks=1 instructions=28362 Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: +┌───────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├───────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ Const │ 3.05176e-05 │ +└───────────────┴─────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: +┌───────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├───────────────┼─────────────┤ +│ ExternalInput │ 1.89232 │ +│ Const │ 3.05176e-05 │ +└───────────────┴─────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ LDWEIGHTS │ 21504 │ +│ MATMUL │ 21504 │ +│ COPY │ 2912 │ +│ EVENT_SEMAPHORE │ 2114 │ +│ UNKNOWN(0xd4) │ 1653 │ +│ UNKNOWN(0xd8) │ 169 │ +│ NOP │ 8 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ACT_TABLE_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2568 │ +│ Scalar │ 2172 │ +│ Tensor │ 43514 │ +│ SyncDMA │ 0 │ +│ Vector │ 1368 │ +│ Sync │ 253 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ LDWEIGHTS │ 21561 │ +│ MATMUL │ 21561 │ +│ COPY │ 3025 │ +│ EVENT_SEMAPHORE │ 2068 │ +│ UNKNOWN(0xd4) │ 1767 │ +│ UNKNOWN(0xd8) │ 169 │ +│ PSEUDO_DMA_TRIGGER │ 112 │ +│ NOP │ 8 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ ACT_TABLE_LOAD │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 2663 │ +│ Scalar │ 2392 │ +│ Tensor │ 43646 │ +│ SyncDMA │ 0 │ +│ Vector │ 1191 │ +│ Sync │ 390 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [Codegen]: isa_gen finished after 0.192 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +└────────────────┴────────────────┘ + +Total descriptors: 0 (0 GB) +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌──────────────┬─────────────────────┐ +│ Queue │ DMA Engines │ +├──────────────┼─────────────────────┤ +│ qPoolDynamic │ 16 │ +│ qSPDynamicHW │ 16 │ +├──────────────┼─────────────────────┤ +│ TOTAL │ 32 (must be <= 176) │ +└──────────────┴─────────────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────┬──────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +└─────────────┴──────┴──────────┴──────────────────┘ + +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.000 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [Codegen]: isa_gen finished after 0.194 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├────────────────┼────────────────┤ +│ qPoolIO0 │ 112 │ +│ qSPIO0 │ 112 │ +└────────────────┴────────────────┘ + +Total descriptors: 224 (3.33786e-06 GB) +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌──────────────┬─────────────────────┐ +│ Queue │ DMA Engines │ +├──────────────┼─────────────────────┤ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qSPIO0 │ 16 │ +│ qPoolIO0 │ 16 │ +├──────────────┼─────────────────────┤ +│ TOTAL │ 64 (must be <= 176) │ +└──────────────┴─────────────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ input26 │ ExternalInput │ bfloat16 │ 1 │ +│ _transpose.545-t33248_i0 │ Internal │ bfloat16 │ 1 │ +│ input215 │ ExternalInput │ bfloat16 │ 1 │ +│ transpose.446.37096_i0 │ Internal │ bfloat16 │ 1 │ +│ transpose.323.37050_i0 │ Internal │ bfloat16 │ 1 │ +│ transpose.389.37074_i0 │ Internal │ bfloat16 │ 1 │ +│ input4 │ ExternalInput │ bfloat16 │ 1 │ +│ input259 │ ExternalInput │ bfloat16 │ 1 │ +│ transpose.312.37046_i0 │ Internal │ bfloat16 │ 1 │ +│ input237 │ ExternalInput │ bfloat16 │ 1 │ +└──────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.002 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.047 seconds +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.047 seconds +2025-11-04T21:42:23Z USER 10021 (nc01/sg00) [ModuleForkPass]: codegen finished after 0.245 seconds +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 592mb, ru_maxrss: 592mb (delta=7mb) +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5190 memory location(s), 1 block(s), and 28362 instruction(s). Max writers: 8 Max Readers: 21504 +2025-11-04T21:42:23Z USER 10021 (nc00/sg00) [ModuleForkPass]: codegen finished after 0.251 seconds +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 562mb, ru_maxrss: 592mb (delta=7mb) +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 5473 memory location(s), 1 block(s), and 28712 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: mod_parallel_pass finished after 0.256 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 540mb, ru_maxrss: 592mb (delta=7mb) +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running hbm_usage +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 3.500KB │ 0.000B │ +│ CCE │ 0.000B │ 0.000B │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 28.000KB │ 0.000B │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬──────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼──────────┤ +│ Total: │ 1.907GB │ +│ Model Code │ 3.069MB │ +│ Model Constants │ 32.000KB │ +│ Unallocated Tensors │ 1.892GB │ +│ Allocated Tensors │ 12.000MB │ +│ DMA Ring IO │ 31.500KB │ +│ DMA Ring Spill │ 0.000B │ +└─────────────────────┴──────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 0.000B │ 0.000B │ +│ CCE │ 0.000B │ 0.000B │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 0.000B │ 0.000B │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:42:23Z INFO 10021 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬──────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼──────────┤ +│ Total: │ 1.907GB │ +│ Model Code │ 3.044MB │ +│ Model Constants │ 32.000KB │ +│ Unallocated Tensors │ 1.892GB │ +│ Allocated Tensors │ 12.000MB │ +│ DMA Ring IO │ 0.000B │ +│ DMA Ring Spill │ 0.000B │ +└─────────────────────┴──────────┘ + +2025-11-04T21:42:23Z INFO 10021 [HBMUsage]: Total estimated HBM usage is: 1.922GB +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: hbm_usage finished after 0.002 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 529mb, ru_maxrss: 592mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 57074 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: Running neff_packager +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=10663 blocks=2 instructions=57074 Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z WARNING 10021 [NeffFileWriter]: writeKelp missing file /home/ubuntu/neuronxcc-799a9j1_/metrics.json +2025-11-04T21:42:23Z WARNING 10021 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:42:23Z INFO 10021 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/graph.neff +2025-11-04T21:42:23Z INFO 10021 [NeffFileWriter]: IR signature: b9ba983939a7ca81358d9df72e2acdda for neff artifacts +2025-11-04T21:42:23Z USER 10021 [BackendPassManager]: neff_packager finished after 0.088 seconds +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: curr_vmrss: 530mb, ru_maxrss: 592mb (delta=0mb) +2025-11-04T21:42:23Z INFO 10021 [BackendPassManager]: Output has 2 module(s), 2 function(s), 10663 memory location(s), 2 block(s), and 57074 instruction(s). Max writers: 8 Max Readers: 21561 +2025-11-04T21:42:23Z INFO 10021 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.011719 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.328125 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.011719 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000000 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000000 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.011719 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:42:23Z INFO 10021 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬──────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼──────────────┤ +│ _transpose.316 │ bfloat16 │ 1 │ 12.000000 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴──────────────┘ + +2025-11-04T21:42:23Z INFO 10021 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:42:23Z INFO 9926 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:42:23Z INFO 9926 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/model/graph.hlo"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/neuronxcc-799a9j1_/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:42:23Z INFO 9926 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/neuronxcc-799a9j1_ +2025-11-04T21:42:23Z INFO 9926 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:42:23Z INFO 9926 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/model/graph.hlo --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/graph.neff --io_transposes /home/ubuntu/neuronxcc-799a9j1_/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/layout_opt/wrapped_neff.hlo --netlist /home/ubuntu/neuronxcc-799a9j1_/hlo_netlist.json +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/neuronxcc-799a9j1_/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:42:23Z INFO 9926 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:42:23Z INFO 9926 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:42:23Z INFO 9914 [root]: Subcommand returned with exitcode=0 diff --git a/layout_opt/metaneff b/layout_opt/metaneff new file mode 100644 index 0000000000000000000000000000000000000000..23d7cf2253bfbb3252e51c87e2a9bfd4d3bdaa98 --- /dev/null +++ b/layout_opt/metaneff @@ -0,0 +1,934 @@ + +( +input0�� �2embed_tokens.weight8 +; +input1��2'layers.0.self_attn.o_proj.o_proj.weight8 += +input2��2)layers.0.self_attn.qkv_proj.v_proj.weight8 +1 +input3�2layers.0.input_layernorm.weight8 +7 +input4�2%layers.0.self_attn.k_layernorm.weight8 += +input5��2)layers.0.self_attn.qkv_proj.k_proj.weight8 +7 +input6�2%layers.0.self_attn.q_layernorm.weight8 += +input7��2)layers.0.self_attn.qkv_proj.q_proj.weight8 +1 +input8��2layers.0.mlp.down_proj.weight8 +/ +input9��2layers.0.mlp.up_proj.weight8 +; +input10�2(layers.0.post_attention_layernorm.weight8 +2 +input11��2layers.0.mlp.gate_proj.weight8 +< +input12��2'layers.1.self_attn.o_proj.o_proj.weight8 +> +input13��2)layers.1.self_attn.qkv_proj.v_proj.weight8 +2 +input14�2layers.1.input_layernorm.weight8 +8 +input15�2%layers.1.self_attn.k_layernorm.weight8 +> +input16��2)layers.1.self_attn.qkv_proj.k_proj.weight8 +8 +input17�2%layers.1.self_attn.q_layernorm.weight8 +> +input18��2)layers.1.self_attn.qkv_proj.q_proj.weight8 +2 +input19��2layers.1.mlp.down_proj.weight8 +0 +input20��2layers.1.mlp.up_proj.weight8 +; +input21�2(layers.1.post_attention_layernorm.weight8 +2 +input22��2layers.1.mlp.gate_proj.weight8 +< +input23��2'layers.2.self_attn.o_proj.o_proj.weight8 +> +input24��2)layers.2.self_attn.qkv_proj.v_proj.weight8 +2 +input25�2layers.2.input_layernorm.weight8 +8 +input26�2%layers.2.self_attn.k_layernorm.weight8 +> +input27��2)layers.2.self_attn.qkv_proj.k_proj.weight8 +8 +input28�2%layers.2.self_attn.q_layernorm.weight8 +> +input29��2)layers.2.self_attn.qkv_proj.q_proj.weight8 +2 +input30��2layers.2.mlp.down_proj.weight8 +0 +input31��2layers.2.mlp.up_proj.weight8 +; +input32�2(layers.2.post_attention_layernorm.weight8 +2 +input33��2layers.2.mlp.gate_proj.weight8 +< +input34��2'layers.3.self_attn.o_proj.o_proj.weight8 +> +input35��2)layers.3.self_attn.qkv_proj.v_proj.weight8 +2 +input36�2layers.3.input_layernorm.weight8 +8 +input37�2%layers.3.self_attn.k_layernorm.weight8 +> +input38��2)layers.3.self_attn.qkv_proj.k_proj.weight8 +8 +input39�2%layers.3.self_attn.q_layernorm.weight8 +> +input40��2)layers.3.self_attn.qkv_proj.q_proj.weight8 +2 +input41��2layers.3.mlp.down_proj.weight8 +0 +input42��2layers.3.mlp.up_proj.weight8 +; +input43�2(layers.3.post_attention_layernorm.weight8 +2 +input44��2layers.3.mlp.gate_proj.weight8 +< +input45��2'layers.4.self_attn.o_proj.o_proj.weight8 +> +input46��2)layers.4.self_attn.qkv_proj.v_proj.weight8 +2 +input47�2layers.4.input_layernorm.weight8 +8 +input48�2%layers.4.self_attn.k_layernorm.weight8 +> +input49��2)layers.4.self_attn.qkv_proj.k_proj.weight8 +8 +input50�2%layers.4.self_attn.q_layernorm.weight8 +> +input51��2)layers.4.self_attn.qkv_proj.q_proj.weight8 +2 +input52��2layers.4.mlp.down_proj.weight8 +0 +input53��2layers.4.mlp.up_proj.weight8 +; +input54�2(layers.4.post_attention_layernorm.weight8 +2 +input55��2layers.4.mlp.gate_proj.weight8 +< +input56��2'layers.5.self_attn.o_proj.o_proj.weight8 +> +input57��2)layers.5.self_attn.qkv_proj.v_proj.weight8 +2 +input58�2layers.5.input_layernorm.weight8 +8 +input59�2%layers.5.self_attn.k_layernorm.weight8 +> +input60��2)layers.5.self_attn.qkv_proj.k_proj.weight8 +8 +input61�2%layers.5.self_attn.q_layernorm.weight8 +> +input62��2)layers.5.self_attn.qkv_proj.q_proj.weight8 +2 +input63��2layers.5.mlp.down_proj.weight8 +0 +input64��2layers.5.mlp.up_proj.weight8 +; +input65�2(layers.5.post_attention_layernorm.weight8 +2 +input66��2layers.5.mlp.gate_proj.weight8 +< +input67��2'layers.6.self_attn.o_proj.o_proj.weight8 +> +input68��2)layers.6.self_attn.qkv_proj.v_proj.weight8 +2 +input69�2layers.6.input_layernorm.weight8 +8 +input70�2%layers.6.self_attn.k_layernorm.weight8 +> +input71��2)layers.6.self_attn.qkv_proj.k_proj.weight8 +8 +input72�2%layers.6.self_attn.q_layernorm.weight8 +> +input73��2)layers.6.self_attn.qkv_proj.q_proj.weight8 +2 +input74��2layers.6.mlp.down_proj.weight8 +0 +input75��2layers.6.mlp.up_proj.weight8 +; +input76�2(layers.6.post_attention_layernorm.weight8 +2 +input77��2layers.6.mlp.gate_proj.weight8 +< +input78��2'layers.7.self_attn.o_proj.o_proj.weight8 +> +input79��2)layers.7.self_attn.qkv_proj.v_proj.weight8 +2 +input80�2layers.7.input_layernorm.weight8 +8 +input81�2%layers.7.self_attn.k_layernorm.weight8 +> +input82��2)layers.7.self_attn.qkv_proj.k_proj.weight8 +8 +input83�2%layers.7.self_attn.q_layernorm.weight8 +> +input84��2)layers.7.self_attn.qkv_proj.q_proj.weight8 +2 +input85��2layers.7.mlp.down_proj.weight8 +0 +input86��2layers.7.mlp.up_proj.weight8 +; +input87�2(layers.7.post_attention_layernorm.weight8 +2 +input88��2layers.7.mlp.gate_proj.weight8 +< +input89��2'layers.8.self_attn.o_proj.o_proj.weight8 +> +input90��2)layers.8.self_attn.qkv_proj.v_proj.weight8 +2 +input91�2layers.8.input_layernorm.weight8 +8 +input92�2%layers.8.self_attn.k_layernorm.weight8 +> +input93��2)layers.8.self_attn.qkv_proj.k_proj.weight8 +8 +input94�2%layers.8.self_attn.q_layernorm.weight8 +> +input95��2)layers.8.self_attn.qkv_proj.q_proj.weight8 +2 +input96��2layers.8.mlp.down_proj.weight8 +0 +input97��2layers.8.mlp.up_proj.weight8 +; +input98�2(layers.8.post_attention_layernorm.weight8 +2 +input99��2layers.8.mlp.gate_proj.weight8 += +input100��2'layers.9.self_attn.o_proj.o_proj.weight8 +? +input101��2)layers.9.self_attn.qkv_proj.v_proj.weight8 +3 +input102�2layers.9.input_layernorm.weight8 +9 +input103�2%layers.9.self_attn.k_layernorm.weight8 +? +input104��2)layers.9.self_attn.qkv_proj.k_proj.weight8 +9 +input105�2%layers.9.self_attn.q_layernorm.weight8 +? +input106��2)layers.9.self_attn.qkv_proj.q_proj.weight8 +3 +input107��2layers.9.mlp.down_proj.weight8 +1 +input108��2layers.9.mlp.up_proj.weight8 +< +input109�2(layers.9.post_attention_layernorm.weight8 +3 +input110��2layers.9.mlp.gate_proj.weight8 +> +input111��2(layers.10.self_attn.o_proj.o_proj.weight8 +@ +input112��2*layers.10.self_attn.qkv_proj.v_proj.weight8 +4 +input113�2 layers.10.input_layernorm.weight8 +: +input114�2&layers.10.self_attn.k_layernorm.weight8 +@ +input115��2*layers.10.self_attn.qkv_proj.k_proj.weight8 +: +input116�2&layers.10.self_attn.q_layernorm.weight8 +@ +input117��2*layers.10.self_attn.qkv_proj.q_proj.weight8 +4 +input118��2layers.10.mlp.down_proj.weight8 +2 +input119��2layers.10.mlp.up_proj.weight8 += +input120�2)layers.10.post_attention_layernorm.weight8 +4 +input121��2layers.10.mlp.gate_proj.weight8 +> +input122��2(layers.11.self_attn.o_proj.o_proj.weight8 +@ +input123��2*layers.11.self_attn.qkv_proj.v_proj.weight8 +4 +input124�2 layers.11.input_layernorm.weight8 +: +input125�2&layers.11.self_attn.k_layernorm.weight8 +@ +input126��2*layers.11.self_attn.qkv_proj.k_proj.weight8 +: +input127�2&layers.11.self_attn.q_layernorm.weight8 +@ +input128��2*layers.11.self_attn.qkv_proj.q_proj.weight8 +4 +input129��2layers.11.mlp.down_proj.weight8 +2 +input130��2layers.11.mlp.up_proj.weight8 += +input131�2)layers.11.post_attention_layernorm.weight8 +4 +input132��2layers.11.mlp.gate_proj.weight8 +> +input133��2(layers.12.self_attn.o_proj.o_proj.weight8 +@ +input134��2*layers.12.self_attn.qkv_proj.v_proj.weight8 +4 +input135�2 layers.12.input_layernorm.weight8 +: +input136�2&layers.12.self_attn.k_layernorm.weight8 +@ +input137��2*layers.12.self_attn.qkv_proj.k_proj.weight8 +: +input138�2&layers.12.self_attn.q_layernorm.weight8 +@ +input139��2*layers.12.self_attn.qkv_proj.q_proj.weight8 +4 +input140��2layers.12.mlp.down_proj.weight8 +2 +input141��2layers.12.mlp.up_proj.weight8 += +input142�2)layers.12.post_attention_layernorm.weight8 +4 +input143��2layers.12.mlp.gate_proj.weight8 +> +input144��2(layers.13.self_attn.o_proj.o_proj.weight8 +@ +input145��2*layers.13.self_attn.qkv_proj.v_proj.weight8 +4 +input146�2 layers.13.input_layernorm.weight8 +: +input147�2&layers.13.self_attn.k_layernorm.weight8 +@ +input148��2*layers.13.self_attn.qkv_proj.k_proj.weight8 +: +input149�2&layers.13.self_attn.q_layernorm.weight8 +@ +input150��2*layers.13.self_attn.qkv_proj.q_proj.weight8 +4 +input151��2layers.13.mlp.down_proj.weight8 +2 +input152��2layers.13.mlp.up_proj.weight8 += +input153�2)layers.13.post_attention_layernorm.weight8 +4 +input154��2layers.13.mlp.gate_proj.weight8 +> +input155��2(layers.14.self_attn.o_proj.o_proj.weight8 +@ +input156��2*layers.14.self_attn.qkv_proj.v_proj.weight8 +4 +input157�2 layers.14.input_layernorm.weight8 +: +input158�2&layers.14.self_attn.k_layernorm.weight8 +@ +input159��2*layers.14.self_attn.qkv_proj.k_proj.weight8 +: +input160�2&layers.14.self_attn.q_layernorm.weight8 +@ +input161��2*layers.14.self_attn.qkv_proj.q_proj.weight8 +4 +input162��2layers.14.mlp.down_proj.weight8 +2 +input163��2layers.14.mlp.up_proj.weight8 += +input164�2)layers.14.post_attention_layernorm.weight8 +4 +input165��2layers.14.mlp.gate_proj.weight8 +> +input166��2(layers.15.self_attn.o_proj.o_proj.weight8 +@ +input167��2*layers.15.self_attn.qkv_proj.v_proj.weight8 +4 +input168�2 layers.15.input_layernorm.weight8 +: +input169�2&layers.15.self_attn.k_layernorm.weight8 +@ +input170��2*layers.15.self_attn.qkv_proj.k_proj.weight8 +: +input171�2&layers.15.self_attn.q_layernorm.weight8 +@ +input172��2*layers.15.self_attn.qkv_proj.q_proj.weight8 +4 +input173��2layers.15.mlp.down_proj.weight8 +2 +input174��2layers.15.mlp.up_proj.weight8 += +input175�2)layers.15.post_attention_layernorm.weight8 +4 +input176��2layers.15.mlp.gate_proj.weight8 +> +input177��2(layers.16.self_attn.o_proj.o_proj.weight8 +@ +input178��2*layers.16.self_attn.qkv_proj.v_proj.weight8 +4 +input179�2 layers.16.input_layernorm.weight8 +: +input180�2&layers.16.self_attn.k_layernorm.weight8 +@ +input181��2*layers.16.self_attn.qkv_proj.k_proj.weight8 +: +input182�2&layers.16.self_attn.q_layernorm.weight8 +@ +input183��2*layers.16.self_attn.qkv_proj.q_proj.weight8 +4 +input184��2layers.16.mlp.down_proj.weight8 +2 +input185��2layers.16.mlp.up_proj.weight8 += +input186�2)layers.16.post_attention_layernorm.weight8 +4 +input187��2layers.16.mlp.gate_proj.weight8 +> +input188��2(layers.17.self_attn.o_proj.o_proj.weight8 +@ +input189��2*layers.17.self_attn.qkv_proj.v_proj.weight8 +4 +input190�2 layers.17.input_layernorm.weight8 +: +input191�2&layers.17.self_attn.k_layernorm.weight8 +@ +input192��2*layers.17.self_attn.qkv_proj.k_proj.weight8 +: +input193�2&layers.17.self_attn.q_layernorm.weight8 +@ +input194��2*layers.17.self_attn.qkv_proj.q_proj.weight8 +4 +input195��2layers.17.mlp.down_proj.weight8 +2 +input196��2layers.17.mlp.up_proj.weight8 += +input197�2)layers.17.post_attention_layernorm.weight8 +4 +input198��2layers.17.mlp.gate_proj.weight8 +> +input199��2(layers.18.self_attn.o_proj.o_proj.weight8 +@ +input200��2*layers.18.self_attn.qkv_proj.v_proj.weight8 +4 +input201�2 layers.18.input_layernorm.weight8 +: +input202�2&layers.18.self_attn.k_layernorm.weight8 +@ +input203��2*layers.18.self_attn.qkv_proj.k_proj.weight8 +: +input204�2&layers.18.self_attn.q_layernorm.weight8 +@ +input205��2*layers.18.self_attn.qkv_proj.q_proj.weight8 +4 +input206��2layers.18.mlp.down_proj.weight8 +2 +input207��2layers.18.mlp.up_proj.weight8 += +input208�2)layers.18.post_attention_layernorm.weight8 +4 +input209��2layers.18.mlp.gate_proj.weight8 +> +input210��2(layers.19.self_attn.o_proj.o_proj.weight8 +@ +input211��2*layers.19.self_attn.qkv_proj.v_proj.weight8 +4 +input212�2 layers.19.input_layernorm.weight8 +: +input213�2&layers.19.self_attn.k_layernorm.weight8 +@ +input214��2*layers.19.self_attn.qkv_proj.k_proj.weight8 +: +input215�2&layers.19.self_attn.q_layernorm.weight8 +@ +input216��2*layers.19.self_attn.qkv_proj.q_proj.weight8 +4 +input217��2layers.19.mlp.down_proj.weight8 +2 +input218��2layers.19.mlp.up_proj.weight8 += +input219�2)layers.19.post_attention_layernorm.weight8 +4 +input220��2layers.19.mlp.gate_proj.weight8 +> +input221��2(layers.20.self_attn.o_proj.o_proj.weight8 +@ +input222��2*layers.20.self_attn.qkv_proj.v_proj.weight8 +4 +input223�2 layers.20.input_layernorm.weight8 +: +input224�2&layers.20.self_attn.k_layernorm.weight8 +@ +input225��2*layers.20.self_attn.qkv_proj.k_proj.weight8 +: +input226�2&layers.20.self_attn.q_layernorm.weight8 +@ +input227��2*layers.20.self_attn.qkv_proj.q_proj.weight8 +4 +input228��2layers.20.mlp.down_proj.weight8 +2 +input229��2layers.20.mlp.up_proj.weight8 += +input230�2)layers.20.post_attention_layernorm.weight8 +4 +input231��2layers.20.mlp.gate_proj.weight8 +> +input232��2(layers.21.self_attn.o_proj.o_proj.weight8 +@ +input233��2*layers.21.self_attn.qkv_proj.v_proj.weight8 +4 +input234�2 layers.21.input_layernorm.weight8 +: +input235�2&layers.21.self_attn.k_layernorm.weight8 +@ +input236��2*layers.21.self_attn.qkv_proj.k_proj.weight8 +: +input237�2&layers.21.self_attn.q_layernorm.weight8 +@ +input238��2*layers.21.self_attn.qkv_proj.q_proj.weight8 +4 +input239��2layers.21.mlp.down_proj.weight8 +2 +input240��2layers.21.mlp.up_proj.weight8 += +input241�2)layers.21.post_attention_layernorm.weight8 +4 +input242��2layers.21.mlp.gate_proj.weight8 +> +input243��2(layers.22.self_attn.o_proj.o_proj.weight8 +@ +input244��2*layers.22.self_attn.qkv_proj.v_proj.weight8 +4 +input245�2 layers.22.input_layernorm.weight8 +: +input246�2&layers.22.self_attn.k_layernorm.weight8 +@ +input247��2*layers.22.self_attn.qkv_proj.k_proj.weight8 +: +input248�2&layers.22.self_attn.q_layernorm.weight8 +@ +input249��2*layers.22.self_attn.qkv_proj.q_proj.weight8 +4 +input250��2layers.22.mlp.down_proj.weight8 +2 +input251��2layers.22.mlp.up_proj.weight8 += +input252�2)layers.22.post_attention_layernorm.weight8 +4 +input253��2layers.22.mlp.gate_proj.weight8 +> +input254��2(layers.23.self_attn.o_proj.o_proj.weight8 +@ +input255��2*layers.23.self_attn.qkv_proj.v_proj.weight8 +4 +input256�2 layers.23.input_layernorm.weight8 +: +input257�2&layers.23.self_attn.k_layernorm.weight8 +@ +input258��2*layers.23.self_attn.qkv_proj.k_proj.weight8 +: +input259�2&layers.23.self_attn.q_layernorm.weight8 +@ +input260��2*layers.23.self_attn.qkv_proj.q_proj.weight8 +4 +input261��2layers.23.mlp.down_proj.weight8 +2 +input262��2layers.23.mlp.up_proj.weight8 += +input263�2)layers.23.post_attention_layernorm.weight8 +4 +input264��2layers.23.mlp.gate_proj.weight8 +> +input265��2(layers.24.self_attn.o_proj.o_proj.weight8 +@ +input266��2*layers.24.self_attn.qkv_proj.v_proj.weight8 +4 +input267�2 layers.24.input_layernorm.weight8 +: +input268�2&layers.24.self_attn.k_layernorm.weight8 +@ +input269��2*layers.24.self_attn.qkv_proj.k_proj.weight8 +: +input270�2&layers.24.self_attn.q_layernorm.weight8 +@ +input271��2*layers.24.self_attn.qkv_proj.q_proj.weight8 +4 +input272��2layers.24.mlp.down_proj.weight8 +2 +input273��2layers.24.mlp.up_proj.weight8 += +input274�2)layers.24.post_attention_layernorm.weight8 +4 +input275��2layers.24.mlp.gate_proj.weight8 +> +input276��2(layers.25.self_attn.o_proj.o_proj.weight8 +@ +input277��2*layers.25.self_attn.qkv_proj.v_proj.weight8 +4 +input278�2 layers.25.input_layernorm.weight8 +: +input279�2&layers.25.self_attn.k_layernorm.weight8 +@ +input280��2*layers.25.self_attn.qkv_proj.k_proj.weight8 +: +input281�2&layers.25.self_attn.q_layernorm.weight8 +@ +input282��2*layers.25.self_attn.qkv_proj.q_proj.weight8 +4 +input283��2layers.25.mlp.down_proj.weight8 +2 +input284��2layers.25.mlp.up_proj.weight8 += +input285�2)layers.25.post_attention_layernorm.weight8 +4 +input286��2layers.25.mlp.gate_proj.weight8 +> +input287��2(layers.26.self_attn.o_proj.o_proj.weight8 +@ +input288��2*layers.26.self_attn.qkv_proj.v_proj.weight8 +4 +input289�2 layers.26.input_layernorm.weight8 +: +input290�2&layers.26.self_attn.k_layernorm.weight8 +@ +input291��2*layers.26.self_attn.qkv_proj.k_proj.weight8 +: +input292�2&layers.26.self_attn.q_layernorm.weight8 +@ +input293��2*layers.26.self_attn.qkv_proj.q_proj.weight8 +4 +input294��2layers.26.mlp.down_proj.weight8 +2 +input295��2layers.26.mlp.up_proj.weight8 += +input296�2)layers.26.post_attention_layernorm.weight8 +4 +input297��2layers.26.mlp.gate_proj.weight8 +> +input298��2(layers.27.self_attn.o_proj.o_proj.weight8 +@ +input299��2*layers.27.self_attn.qkv_proj.v_proj.weight8 +4 +input300�2 layers.27.input_layernorm.weight8 +: +input301�2&layers.27.self_attn.k_layernorm.weight8 +@ +input302��2*layers.27.self_attn.qkv_proj.k_proj.weight8 +: +input303�2&layers.27.self_attn.q_layernorm.weight8 +@ +input304��2*layers.27.self_attn.qkv_proj.q_proj.weight8 +4 +input305��2layers.27.mlp.down_proj.weight8 +2 +input306��2layers.27.mlp.up_proj.weight8 += +input307�2)layers.27.post_attention_layernorm.weight8 +4 +input308��2layers.27.mlp.gate_proj.weight8 +% +input309���2lm_head.weight8 + +input310�2 norm.weight8' +output0�� �2embed_tokens.weight: +output1��2'layers.0.self_attn.o_proj.o_proj.weight< +output2��2)layers.0.self_attn.qkv_proj.v_proj.weight0 +output3�2layers.0.input_layernorm.weight6 +output4�2%layers.0.self_attn.k_layernorm.weight< +output5��2)layers.0.self_attn.qkv_proj.k_proj.weight6 +output6�2%layers.0.self_attn.q_layernorm.weight< +output7��2)layers.0.self_attn.qkv_proj.q_proj.weight0 +output8��2layers.0.mlp.down_proj.weight. +output9��2layers.0.mlp.up_proj.weight: +output10�2(layers.0.post_attention_layernorm.weight1 +output11��2layers.0.mlp.gate_proj.weight; +output12��2'layers.1.self_attn.o_proj.o_proj.weight= +output13��2)layers.1.self_attn.qkv_proj.v_proj.weight1 +output14�2layers.1.input_layernorm.weight7 +output15�2%layers.1.self_attn.k_layernorm.weight= +output16��2)layers.1.self_attn.qkv_proj.k_proj.weight7 +output17�2%layers.1.self_attn.q_layernorm.weight= +output18��2)layers.1.self_attn.qkv_proj.q_proj.weight1 +output19��2layers.1.mlp.down_proj.weight/ +output20��2layers.1.mlp.up_proj.weight: +output21�2(layers.1.post_attention_layernorm.weight1 +output22��2layers.1.mlp.gate_proj.weight; +output23��2'layers.2.self_attn.o_proj.o_proj.weight= +output24��2)layers.2.self_attn.qkv_proj.v_proj.weight1 +output25�2layers.2.input_layernorm.weight7 +output26�2%layers.2.self_attn.k_layernorm.weight= +output27��2)layers.2.self_attn.qkv_proj.k_proj.weight7 +output28�2%layers.2.self_attn.q_layernorm.weight= +output29��2)layers.2.self_attn.qkv_proj.q_proj.weight1 +output30��2layers.2.mlp.down_proj.weight/ +output31��2layers.2.mlp.up_proj.weight: +output32�2(layers.2.post_attention_layernorm.weight1 +output33��2layers.2.mlp.gate_proj.weight; +output34��2'layers.3.self_attn.o_proj.o_proj.weight= +output35��2)layers.3.self_attn.qkv_proj.v_proj.weight1 +output36�2layers.3.input_layernorm.weight7 +output37�2%layers.3.self_attn.k_layernorm.weight= +output38��2)layers.3.self_attn.qkv_proj.k_proj.weight7 +output39�2%layers.3.self_attn.q_layernorm.weight= +output40��2)layers.3.self_attn.qkv_proj.q_proj.weight1 +output41��2layers.3.mlp.down_proj.weight/ +output42��2layers.3.mlp.up_proj.weight: +output43�2(layers.3.post_attention_layernorm.weight1 +output44��2layers.3.mlp.gate_proj.weight; +output45��2'layers.4.self_attn.o_proj.o_proj.weight= +output46��2)layers.4.self_attn.qkv_proj.v_proj.weight1 +output47�2layers.4.input_layernorm.weight7 +output48�2%layers.4.self_attn.k_layernorm.weight= +output49��2)layers.4.self_attn.qkv_proj.k_proj.weight7 +output50�2%layers.4.self_attn.q_layernorm.weight= +output51��2)layers.4.self_attn.qkv_proj.q_proj.weight1 +output52��2layers.4.mlp.down_proj.weight/ +output53��2layers.4.mlp.up_proj.weight: +output54�2(layers.4.post_attention_layernorm.weight1 +output55��2layers.4.mlp.gate_proj.weight; +output56��2'layers.5.self_attn.o_proj.o_proj.weight= +output57��2)layers.5.self_attn.qkv_proj.v_proj.weight1 +output58�2layers.5.input_layernorm.weight7 +output59�2%layers.5.self_attn.k_layernorm.weight= +output60��2)layers.5.self_attn.qkv_proj.k_proj.weight7 +output61�2%layers.5.self_attn.q_layernorm.weight= +output62��2)layers.5.self_attn.qkv_proj.q_proj.weight1 +output63��2layers.5.mlp.down_proj.weight/ +output64��2layers.5.mlp.up_proj.weight: +output65�2(layers.5.post_attention_layernorm.weight1 +output66��2layers.5.mlp.gate_proj.weight; +output67��2'layers.6.self_attn.o_proj.o_proj.weight= +output68��2)layers.6.self_attn.qkv_proj.v_proj.weight1 +output69�2layers.6.input_layernorm.weight7 +output70�2%layers.6.self_attn.k_layernorm.weight= +output71��2)layers.6.self_attn.qkv_proj.k_proj.weight7 +output72�2%layers.6.self_attn.q_layernorm.weight= +output73��2)layers.6.self_attn.qkv_proj.q_proj.weight1 +output74��2layers.6.mlp.down_proj.weight/ +output75��2layers.6.mlp.up_proj.weight: +output76�2(layers.6.post_attention_layernorm.weight1 +output77��2layers.6.mlp.gate_proj.weight; +output78��2'layers.7.self_attn.o_proj.o_proj.weight= +output79��2)layers.7.self_attn.qkv_proj.v_proj.weight1 +output80�2layers.7.input_layernorm.weight7 +output81�2%layers.7.self_attn.k_layernorm.weight= +output82��2)layers.7.self_attn.qkv_proj.k_proj.weight7 +output83�2%layers.7.self_attn.q_layernorm.weight= +output84��2)layers.7.self_attn.qkv_proj.q_proj.weight1 +output85��2layers.7.mlp.down_proj.weight/ +output86��2layers.7.mlp.up_proj.weight: +output87�2(layers.7.post_attention_layernorm.weight1 +output88��2layers.7.mlp.gate_proj.weight; +output89��2'layers.8.self_attn.o_proj.o_proj.weight= +output90��2)layers.8.self_attn.qkv_proj.v_proj.weight1 +output91�2layers.8.input_layernorm.weight7 +output92�2%layers.8.self_attn.k_layernorm.weight= +output93��2)layers.8.self_attn.qkv_proj.k_proj.weight7 +output94�2%layers.8.self_attn.q_layernorm.weight= +output95��2)layers.8.self_attn.qkv_proj.q_proj.weight1 +output96��2layers.8.mlp.down_proj.weight/ +output97��2layers.8.mlp.up_proj.weight: +output98�2(layers.8.post_attention_layernorm.weight1 +output99��2layers.8.mlp.gate_proj.weight< + output100��2'layers.9.self_attn.o_proj.o_proj.weight> + output101��2)layers.9.self_attn.qkv_proj.v_proj.weight2 + output102�2layers.9.input_layernorm.weight8 + output103�2%layers.9.self_attn.k_layernorm.weight> + output104��2)layers.9.self_attn.qkv_proj.k_proj.weight8 + output105�2%layers.9.self_attn.q_layernorm.weight> + output106��2)layers.9.self_attn.qkv_proj.q_proj.weight2 + output107��2layers.9.mlp.down_proj.weight0 + output108��2layers.9.mlp.up_proj.weight; + output109�2(layers.9.post_attention_layernorm.weight2 + output110��2layers.9.mlp.gate_proj.weight= + output111��2(layers.10.self_attn.o_proj.o_proj.weight? + output112��2*layers.10.self_attn.qkv_proj.v_proj.weight3 + output113�2 layers.10.input_layernorm.weight9 + output114�2&layers.10.self_attn.k_layernorm.weight? + output115��2*layers.10.self_attn.qkv_proj.k_proj.weight9 + output116�2&layers.10.self_attn.q_layernorm.weight? + output117��2*layers.10.self_attn.qkv_proj.q_proj.weight3 + output118��2layers.10.mlp.down_proj.weight1 + output119��2layers.10.mlp.up_proj.weight< + output120�2)layers.10.post_attention_layernorm.weight3 + output121��2layers.10.mlp.gate_proj.weight= + output122��2(layers.11.self_attn.o_proj.o_proj.weight? + output123��2*layers.11.self_attn.qkv_proj.v_proj.weight3 + output124�2 layers.11.input_layernorm.weight9 + output125�2&layers.11.self_attn.k_layernorm.weight? + output126��2*layers.11.self_attn.qkv_proj.k_proj.weight9 + output127�2&layers.11.self_attn.q_layernorm.weight? + output128��2*layers.11.self_attn.qkv_proj.q_proj.weight3 + output129��2layers.11.mlp.down_proj.weight1 + output130��2layers.11.mlp.up_proj.weight< + output131�2)layers.11.post_attention_layernorm.weight3 + output132��2layers.11.mlp.gate_proj.weight= + output133��2(layers.12.self_attn.o_proj.o_proj.weight? + output134��2*layers.12.self_attn.qkv_proj.v_proj.weight3 + output135�2 layers.12.input_layernorm.weight9 + output136�2&layers.12.self_attn.k_layernorm.weight? + output137��2*layers.12.self_attn.qkv_proj.k_proj.weight9 + output138�2&layers.12.self_attn.q_layernorm.weight? + output139��2*layers.12.self_attn.qkv_proj.q_proj.weight3 + output140��2layers.12.mlp.down_proj.weight1 + output141��2layers.12.mlp.up_proj.weight< + output142�2)layers.12.post_attention_layernorm.weight3 + output143��2layers.12.mlp.gate_proj.weight= + output144��2(layers.13.self_attn.o_proj.o_proj.weight? + output145��2*layers.13.self_attn.qkv_proj.v_proj.weight3 + output146�2 layers.13.input_layernorm.weight9 + output147�2&layers.13.self_attn.k_layernorm.weight? + output148��2*layers.13.self_attn.qkv_proj.k_proj.weight9 + output149�2&layers.13.self_attn.q_layernorm.weight? + output150��2*layers.13.self_attn.qkv_proj.q_proj.weight3 + output151��2layers.13.mlp.down_proj.weight1 + output152��2layers.13.mlp.up_proj.weight< + output153�2)layers.13.post_attention_layernorm.weight3 + output154��2layers.13.mlp.gate_proj.weight= + output155��2(layers.14.self_attn.o_proj.o_proj.weight? + output156��2*layers.14.self_attn.qkv_proj.v_proj.weight3 + output157�2 layers.14.input_layernorm.weight9 + output158�2&layers.14.self_attn.k_layernorm.weight? + output159��2*layers.14.self_attn.qkv_proj.k_proj.weight9 + output160�2&layers.14.self_attn.q_layernorm.weight? + output161��2*layers.14.self_attn.qkv_proj.q_proj.weight3 + output162��2layers.14.mlp.down_proj.weight1 + output163��2layers.14.mlp.up_proj.weight< + output164�2)layers.14.post_attention_layernorm.weight3 + output165��2layers.14.mlp.gate_proj.weight= + output166��2(layers.15.self_attn.o_proj.o_proj.weight? + output167��2*layers.15.self_attn.qkv_proj.v_proj.weight3 + output168�2 layers.15.input_layernorm.weight9 + output169�2&layers.15.self_attn.k_layernorm.weight? + output170��2*layers.15.self_attn.qkv_proj.k_proj.weight9 + output171�2&layers.15.self_attn.q_layernorm.weight? + output172��2*layers.15.self_attn.qkv_proj.q_proj.weight3 + output173��2layers.15.mlp.down_proj.weight1 + output174��2layers.15.mlp.up_proj.weight< + output175�2)layers.15.post_attention_layernorm.weight3 + output176��2layers.15.mlp.gate_proj.weight= + output177��2(layers.16.self_attn.o_proj.o_proj.weight? + output178��2*layers.16.self_attn.qkv_proj.v_proj.weight3 + output179�2 layers.16.input_layernorm.weight9 + output180�2&layers.16.self_attn.k_layernorm.weight? + output181��2*layers.16.self_attn.qkv_proj.k_proj.weight9 + output182�2&layers.16.self_attn.q_layernorm.weight? + output183��2*layers.16.self_attn.qkv_proj.q_proj.weight3 + output184��2layers.16.mlp.down_proj.weight1 + output185��2layers.16.mlp.up_proj.weight< + output186�2)layers.16.post_attention_layernorm.weight3 + output187��2layers.16.mlp.gate_proj.weight= + output188��2(layers.17.self_attn.o_proj.o_proj.weight? + output189��2*layers.17.self_attn.qkv_proj.v_proj.weight3 + output190�2 layers.17.input_layernorm.weight9 + output191�2&layers.17.self_attn.k_layernorm.weight? + output192��2*layers.17.self_attn.qkv_proj.k_proj.weight9 + output193�2&layers.17.self_attn.q_layernorm.weight? + output194��2*layers.17.self_attn.qkv_proj.q_proj.weight3 + output195��2layers.17.mlp.down_proj.weight1 + output196��2layers.17.mlp.up_proj.weight< + output197�2)layers.17.post_attention_layernorm.weight3 + output198��2layers.17.mlp.gate_proj.weight= + output199��2(layers.18.self_attn.o_proj.o_proj.weight? + output200��2*layers.18.self_attn.qkv_proj.v_proj.weight3 + output201�2 layers.18.input_layernorm.weight9 + output202�2&layers.18.self_attn.k_layernorm.weight? + output203��2*layers.18.self_attn.qkv_proj.k_proj.weight9 + output204�2&layers.18.self_attn.q_layernorm.weight? + output205��2*layers.18.self_attn.qkv_proj.q_proj.weight3 + output206��2layers.18.mlp.down_proj.weight1 + output207��2layers.18.mlp.up_proj.weight< + output208�2)layers.18.post_attention_layernorm.weight3 + output209��2layers.18.mlp.gate_proj.weight= + output210��2(layers.19.self_attn.o_proj.o_proj.weight? + output211��2*layers.19.self_attn.qkv_proj.v_proj.weight3 + output212�2 layers.19.input_layernorm.weight9 + output213�2&layers.19.self_attn.k_layernorm.weight? + output214��2*layers.19.self_attn.qkv_proj.k_proj.weight9 + output215�2&layers.19.self_attn.q_layernorm.weight? + output216��2*layers.19.self_attn.qkv_proj.q_proj.weight3 + output217��2layers.19.mlp.down_proj.weight1 + output218��2layers.19.mlp.up_proj.weight< + output219�2)layers.19.post_attention_layernorm.weight3 + output220��2layers.19.mlp.gate_proj.weight= + output221��2(layers.20.self_attn.o_proj.o_proj.weight? + output222��2*layers.20.self_attn.qkv_proj.v_proj.weight3 + output223�2 layers.20.input_layernorm.weight9 + output224�2&layers.20.self_attn.k_layernorm.weight? + output225��2*layers.20.self_attn.qkv_proj.k_proj.weight9 + output226�2&layers.20.self_attn.q_layernorm.weight? + output227��2*layers.20.self_attn.qkv_proj.q_proj.weight3 + output228��2layers.20.mlp.down_proj.weight1 + output229��2layers.20.mlp.up_proj.weight< + output230�2)layers.20.post_attention_layernorm.weight3 + output231��2layers.20.mlp.gate_proj.weight= + output232��2(layers.21.self_attn.o_proj.o_proj.weight? + output233��2*layers.21.self_attn.qkv_proj.v_proj.weight3 + output234�2 layers.21.input_layernorm.weight9 + output235�2&layers.21.self_attn.k_layernorm.weight? + output236��2*layers.21.self_attn.qkv_proj.k_proj.weight9 + output237�2&layers.21.self_attn.q_layernorm.weight? + output238��2*layers.21.self_attn.qkv_proj.q_proj.weight3 + output239��2layers.21.mlp.down_proj.weight1 + output240��2layers.21.mlp.up_proj.weight< + output241�2)layers.21.post_attention_layernorm.weight3 + output242��2layers.21.mlp.gate_proj.weight= + output243��2(layers.22.self_attn.o_proj.o_proj.weight? + output244��2*layers.22.self_attn.qkv_proj.v_proj.weight3 + output245�2 layers.22.input_layernorm.weight9 + output246�2&layers.22.self_attn.k_layernorm.weight? + output247��2*layers.22.self_attn.qkv_proj.k_proj.weight9 + output248�2&layers.22.self_attn.q_layernorm.weight? + output249��2*layers.22.self_attn.qkv_proj.q_proj.weight3 + output250��2layers.22.mlp.down_proj.weight1 + output251��2layers.22.mlp.up_proj.weight< + output252�2)layers.22.post_attention_layernorm.weight3 + output253��2layers.22.mlp.gate_proj.weight= + output254��2(layers.23.self_attn.o_proj.o_proj.weight? + output255��2*layers.23.self_attn.qkv_proj.v_proj.weight3 + output256�2 layers.23.input_layernorm.weight9 + output257�2&layers.23.self_attn.k_layernorm.weight? + output258��2*layers.23.self_attn.qkv_proj.k_proj.weight9 + output259�2&layers.23.self_attn.q_layernorm.weight? + output260��2*layers.23.self_attn.qkv_proj.q_proj.weight3 + output261��2layers.23.mlp.down_proj.weight1 + output262��2layers.23.mlp.up_proj.weight< + output263�2)layers.23.post_attention_layernorm.weight3 + output264��2layers.23.mlp.gate_proj.weight= + output265��2(layers.24.self_attn.o_proj.o_proj.weight? + output266��2*layers.24.self_attn.qkv_proj.v_proj.weight3 + output267�2 layers.24.input_layernorm.weight9 + output268�2&layers.24.self_attn.k_layernorm.weight? + output269��2*layers.24.self_attn.qkv_proj.k_proj.weight9 + output270�2&layers.24.self_attn.q_layernorm.weight? + output271��2*layers.24.self_attn.qkv_proj.q_proj.weight3 + output272��2layers.24.mlp.down_proj.weight1 + output273��2layers.24.mlp.up_proj.weight< + output274�2)layers.24.post_attention_layernorm.weight3 + output275��2layers.24.mlp.gate_proj.weight= + output276��2(layers.25.self_attn.o_proj.o_proj.weight? + output277��2*layers.25.self_attn.qkv_proj.v_proj.weight3 + output278�2 layers.25.input_layernorm.weight9 + output279�2&layers.25.self_attn.k_layernorm.weight? + output280��2*layers.25.self_attn.qkv_proj.k_proj.weight9 + output281�2&layers.25.self_attn.q_layernorm.weight? + output282��2*layers.25.self_attn.qkv_proj.q_proj.weight3 + output283��2layers.25.mlp.down_proj.weight1 + output284��2layers.25.mlp.up_proj.weight< + output285�2)layers.25.post_attention_layernorm.weight3 + output286��2layers.25.mlp.gate_proj.weight= + output287��2(layers.26.self_attn.o_proj.o_proj.weight? + output288��2*layers.26.self_attn.qkv_proj.v_proj.weight3 + output289�2 layers.26.input_layernorm.weight9 + output290�2&layers.26.self_attn.k_layernorm.weight? + output291��2*layers.26.self_attn.qkv_proj.k_proj.weight9 + output292�2&layers.26.self_attn.q_layernorm.weight? + output293��2*layers.26.self_attn.qkv_proj.q_proj.weight3 + output294��2layers.26.mlp.down_proj.weight1 + output295��2layers.26.mlp.up_proj.weight< + output296�2)layers.26.post_attention_layernorm.weight3 + output297��2layers.26.mlp.gate_proj.weight= + output298��2(layers.27.self_attn.o_proj.o_proj.weight? + output299��2*layers.27.self_attn.qkv_proj.v_proj.weight3 + output300�2 layers.27.input_layernorm.weight9 + output301�2&layers.27.self_attn.k_layernorm.weight? + output302��2*layers.27.self_attn.qkv_proj.k_proj.weight9 + output303�2&layers.27.self_attn.q_layernorm.weight? + output304��2*layers.27.self_attn.qkv_proj.q_proj.weight3 + output305��2layers.27.mlp.down_proj.weight1 + output306��2layers.27.mlp.up_proj.weight< + output307�2)layers.27.post_attention_layernorm.weight3 + output308��2layers.27.mlp.gate_proj.weight$ + output309���2lm_head.weight + output310�2 norm.weight \ No newline at end of file diff --git a/layout_opt/model/graph.hlo b/layout_opt/model/graph.hlo new file mode 100644 index 0000000000000000000000000000000000000000..da5f335a1c77780dad3b1ef39b532ec4080fcbc9 --- /dev/null +++ b/layout_opt/model/graph.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e7f2a0e3e653ae48e6ae990b886db92e8301ff7b717e503ed65441f10188f9 +size 197002 diff --git a/model.pt b/model.pt new file mode 100644 index 0000000000000000000000000000000000000000..8057bb6945b8dc8951621b587632323eda984273 --- /dev/null +++ b/model.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6aaf129bd22f3296bb9d7b32cf204828f8e75993888e7b6cfc51c4811c85537 +size 101703819 diff --git a/neuron_config.json b/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1cf47244feb09ec111ba29ab8d70740f712c6b56 --- /dev/null +++ b/neuron_config.json @@ -0,0 +1,222 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 4096 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 2, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": null, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 4096, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": null, + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk0/command.txt b/token_generation_model/_tp0_bk0/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..54bc1737a1ca6d2d72846926ab13ca091b35da8d --- /dev/null +++ b/token_generation_model/_tp0_bk0/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb --output model.MODULE_caeca0352a0240106f96+d5490f71.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/compile_flags.MODULE_caeca0352a0240106f96+d5490f71.json b/token_generation_model/_tp0_bk0/compile_flags.MODULE_caeca0352a0240106f96+d5490f71.json new file mode 100644 index 0000000000000000000000000000000000000000..991709dbcdb337b7bda09590c97a65651fca5748 --- /dev/null +++ b/token_generation_model/_tp0_bk0/compile_flags.MODULE_caeca0352a0240106f96+d5490f71.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/log-neuron-cc.txt", "--enable-internal-neff-wrapper"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/global_metric_store.json b/token_generation_model/_tp0_bk0/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..3159bc9b5ed4f8fd006f1f220057773d85bc3165 --- /dev/null +++ b/token_generation_model/_tp0_bk0/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 97.6100845336914, + "StaticProfiler::AveragePartitionUtilization": 91.14380645751953, + "StaticProfiler::AveragePeUtilization": 81.960205078125, + "StaticProfiler::LocalizationEfficiency": 308.939208984375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 318.39825439453125, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 1.7005221843719482, + "AffinePredicateResolution": 0.03520393371582031, + "AliasDependencyElimination": 0.0027341842651367188, + "AliasDependencyInduction": 0.2813143730163574, + "AliasDependencyReset": 0.2986414432525635, + "BFComputeCutting": 0.06860709190368652, + "BirCodeGenLoop": 1.6297953128814697, + "CCOpFusion": 0.5585761070251465, + "CanonicalizeConv": 0.00022400000307243317, + "CanonicalizeDAGForPGTiling": 0.1489403247833252, + "CanonicalizeForTensorizer": 0.0002570000069681555, + "CanonicalizeIR": 0.046598196029663086, + "Canonicalizer": 0.004408000037074089, + "CoalesceCCOp": 0.1664752960205078, + "CommuteConcat": 0.023354530334472656, + "DMALocalityOpt": 0.039147377014160156, + "DMAProfiler": 0.07851672172546387, + "DMATilingProfiler": 0.08825540542602539, + "DataLocalityOpt": 3.1695549488067627, + "DataStreaming": 0.15042948722839355, + "DeConcat": 0.045781612396240234, + "DeadCodeElimination": 0.024619340896606445, + "DeadStoreElimination": 0.7814517021179199, + "DelinearIndices": 0.46334409713745117, + "Delinearization": 0.10990715026855469, + "DelinearizeSPMD": 0.14683842658996582, + "DoNothing": 0.00035881996154785156, + "DramToDramTranspose": 0.2727994918823242, + "DumpGraphAndMetadata": 0.12869954109191895, + "EliminateDivs": 0.10750436782836914, + "ExpandBatchNorm": 0.04441714286804199, + "ExpandISAMacro": 0.08692693710327148, + "FactorizeBlkDims": 0.7402238845825195, + "FactorizeThreadAxesInFreeDims": 0.06503605842590332, + "FlattenMacroLoop": 0.08033251762390137, + "GenericAccessSimplifier": 0.02213764190673828, + "HoistCompute": 3.7999998312443495e-05, + "IdentifyCrossPassTensors": 0.00014699999883305281, + "InferInitValue": 1.2564187049865723, + "InferIntrinsicOnCC": 0.25655221939086914, + "InferNeuronTensor": 1.5402815341949463, + "InferNonlocalTensors": 2.825716495513916, + "InferPSumTensor": 1.1557531356811523, + "InferShardAxis": 3.487098455429077, + "InferSharedMemLoc": 0.10950160026550293, + "InlineNativeKernels": 0.05088615417480469, + "InsertCoreBarrier": 0.1348874568939209, + "InsertIOTransposes": 1.001678228378296, + "InsertImplicitShardAxisBeforeISel": 0.3830599784851074, + "InsertLocalTransposes": 0.8397743701934814, + "InsertOffloadedTransposes": 0.07152938842773438, + "LICM": 0.12596940994262695, + "LateLegalizeInst": 0.15051579475402832, + "LateLegalizePostSplit": 0.09937071800231934, + "LateLowerReshapeOp": 0.028991222381591797, + "LateLowerTensorOp": 0.22994065284729004, + "LateNeuronInstComb": 1.0390312671661377, + "LayoutPreprocessing": 0.68070387840271, + "LayoutPreprocessingAndAnalysis": 0.991814136505127, + "LayoutRequirementAnalysis": 0.29753780364990234, + "LegalizeCCOpLayout": 0.05271172523498535, + "LegalizeOpLevelAlias": 0.0189054012298584, + "LegalizePartitionReduce": 0.04260826110839844, + "LegalizeSundaAccess": 0.9825439453125, + "LegalizeSundaMacro": 0.6434056758880615, + "LegalizeType": 0.14621663093566895, + "LocalLayoutOpt": 0.5158607959747314, + "LoopFusion": 0.24005889892578125, + "LoopSplitting": 0.026287078857421875, + "LowerBroadcast": 0.057680368423461914, + "LowerCCOpBlockAxis": 0.1771547794342041, + "LowerComplexBroadcast": 0.06588888168334961, + "LowerIntrinsics": 1.200401782989502, + "LowerShardAxis": 0.20541048049926758, + "LowerTensorOp": 0.34528374671936035, + "LowerToSendRecv": 0.13611459732055664, + "LowerTranspose": 0.4788858890533447, + "MacroGeneration": 2.1315596103668213, + "MaskPropagation": 0.0985419750213623, + "MemcastMotion": 0.00010399999882793054, + "MemcpyElimination": 3.2312090396881104, + "MutateDataType": 0.030095577239990234, + "NeuronAliasDependencyInduction": 0.018324851989746094, + "NeuronAliasDependencyReset": 0.03435230255126953, + "NeuronInstComb": 0.32805895805358887, + "NeuronLICM": 0.26866912841796875, + "NeuronLoopFusion": 1.4137792587280273, + "NeuronLoopInterchange": 0.047950029373168945, + "NeuronSimplifier": 0.46399831771850586, + "NeuronSimplifyPredicates": 0.14104795455932617, + "NeuronValueNumbering": 0.11248421669006348, + "OptimizeAliasedCopyChain": 0.0093231201171875, + "OptimizeNKIKernels": 1.330866813659668, + "PAGLayoutOpt": 10.357507705688477, + "PComputeCutting": 0.263751745223999, + "PGLayoutTilingPipeline": 24.891653060913086, + "PGTiling": 4.457480430603027, + "PadElimination": 0.009973287582397461, + "ParAxesAnnotation": 9.500602722167969, + "PartialLoopFusion": 0.991065263748169, + "PartialSimdFusion": 0.5688676834106445, + "PenguinizeFunctions": 0.00012099999730708078, + "PerfectLoopNest": 0.06462287902832031, + "PruneFunctions": 0.0003819999983534217, + "RecognizeOpIdiom": 0.11909127235412598, + "Recompute": 0.005625009536743164, + "RelaxPredicates": 0.1096184253692627, + "Rematerialization": 0.13873934745788574, + "RemoveOptimizationBarriers": 0.00047400000039488077, + "RemoveShardedPartitionAxes": 0.6388535499572754, + "ReshapeWeights": 0.021928071975708008, + "ResolveAccessConflict": 0.1634044647216797, + "ResolveComplicatePredicates": 0.034531354904174805, + "RewriteReplicationMatmul": 0.03591442108154297, + "RewriteWeights": 0.06955599784851074, + "SFKVectorizer": 5.4648590087890625, + "ScatterMotion": 0.0027620000764727592, + "ShardingPropagationAnalysis": 0.5668590068817139, + "SimpleAllReduceTiling": 0.06376838684082031, + "Simplifier": 0.07685470581054688, + "SimplifyMacroPredicates": 0.25678086280822754, + "SimplifyNeuronTensor": 0.3546435832977295, + "SimplifySlice": 0.023622751235961914, + "SimplifyTensor": 0.2752950191497803, + "SpillPSum": 0.5363657474517822, + "SplitAPUnionSets": 0.7056703567504883, + "SplitAccGrp": 0.04378986358642578, + "StaticProfiler": 0.12368035316467285, + "StaticTransposeLocalTensor": 0.21344637870788574, + "SundaISel": 1.3892035484313965, + "TCTransform": 0.02511763572692871, + "TensorInitialization": 0.17117786407470703, + "TensorOpSimplifier": 0.15215492248535156, + "TensorOpTransform": 0.7248854637145996, + "TensorizerLegalizationPass": 0.00012700000661425292, + "TileCCOps": 0.2080836296081543, + "TilingProfiler": 0.3465256690979004, + "TransformConvOp": 0.0547327995300293, + "TritiumFusion": 0.8802590370178223, + "ValueNumbering": 0.06971311569213867, + "VectorizeDMA": 0.5965697765350342, + "VectorizeMatMult": 0.050161123275756836, + "VerifySupportedOps": 0.0001900000061141327, + "WeightCoalescing": 0.05644702911376953, + "ZeroSizeTensorElimination": 0.00035262107849121094, + "algsimp": 0.0011439999798312783, + "batchnorm_expander": 0.0005280000041238964, + "boundary-marker-removal": 0.00015799999528098851, + "call-inliner": 0.00013099999341648072, + "canonicalize-boundary-marker": 0.0002809999859891832, + "collective-stream-id-checker": 3.7000001611886546e-05, + "comparison-expander": 0.00020199999562464654, + "computation-deduplicator": 0.00034699999378062785, + "config-lowering": 0.00011999999696854502, + "constant_folding": 9.200000204145908e-05, + "cse": 0.00037900000461377203, + "dce": 2.300000051036477e-05, + "dynamic-slice-transpose": 9.000000136438757e-05, + "eliminate-redundant-compare": 7.699999696342275e-05, + "emit-offloaded-dropout": 0.0001289999927394092, + "flatten-call-graph": 0.00016199999663513154, + "fuse-send-recv": 0.0006859999848529696, + "hilo-conditional-to-select": 5.2999999752501026e-05, + "hilo::LegalizeAlias": 0.0021039999555796385, + "hilo::NeuronInstCombine": 0.0008660000166855752, + "hilo::NeuronOpFusion": 0.00038499999209307134, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00032500000088475645, + "hilo::ScheduleFusion": 2.700000004551839e-05, + "hilo::SixtyFourHack": 0.00019500000053085387, + "hilo::VerifyAliasing": 4.999999873689376e-05, + "hlo-mac-count": 0.0033499998971819878, + "io-con-pipe-begin": 3.999999989900971e-06, + "io-con-pipe-end": 0.0, + "io-layout-normalization": 0.0005000000237487257, + "legalize-ccops-for-tensorizer": 1.2000000424450263e-05, + "legalize-compare": 0.00015500000154133886, + "lower-argminmax-custom-call": 7.300000288523734e-05, + "map-inline": 0.00038899999344721437, + "metadata-naming": 0.0006970000104047358, + "mlir::detail::OpToOpPassAdaptor": 0.0001320000010309741, + "mlir::hlo::MhloToPyPenguin": 0.021909000352025032, + "mlir::mhlo::LowerComplexExtraPass": 0.0017920000245794654, + "mlir::mhlo::LowerComplexPass": 0.0021520000882446766, + "native-to-custom-softmax": 0.00017699999443721026, + "native-to-custom-softmax-dx": 0.0002229999954579398, + "neuron-hlo-verifier": 0.012144000269472599, + "operand_upcaster": 0.0004459999909158796, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.03227899968624115, + "pre-hlo-begin": 3.000000106112566e-06, + "pre-hlo-end": 0.0, + "replace-minimum-constant": 0.00010099999781232327, + "reshape-mover": 4.600000102072954e-05, + "simplify-concat": 0.0008660000166855752, + "simplify-while-loops": 3.199999991920777e-05, + "transform-variadic-reduce": 0.00023700000019744039, + "tuple-simplifier": 8.600000001024455e-05, + "unpack-nested-aws-ntwsr": 0.00013099999341648072, + "unroll-while-loop": 4.999999873689376e-06 + }, + "hilo": { + "HloMacCount": 6957531136.0, + "Traffic": 3915367168.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 84014, + "StaticProfiler::AifUb": 10.269017219543457, + "StaticProfiler::ArithmeticIntensityTensorizer": 31.725021362304688, + "StaticProfiler::AverageDmaLength": 2704.250244140625, + "StaticProfiler::DDRTransferBytes": 1874197460, + "StaticProfiler::InternalTransferBytes": 366183616, + "StaticProfiler::LoadExpanded": 534942, + "StaticProfiler::StoreExpanded": 22685, + "StaticProfiler::TotalDMAExpanded": 557627, + "StaticProfiler::TotalDynamicInstancesCount": 106305, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 91415, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 355, + "TilingProfiler::MatMultInstructionsAfterTiling": 63265, + "TilingProfiler::NumPfTransposes": 350, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 200, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 15058, + "TilingProfiler::PfTransposeInstructionsForIo": 11297, + "TilingProfiler::PfTransposeInstructionsForLocal": 1351, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 2410, + "TilingProfiler::ReduceInstructionsAfterTiling": 61, + "TilingProfiler::SimdInstructionsAfterTiling": 2597, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.00022400000307243317, + "CanonicalizeForTensorizer": 0.0002570000069681555, + "Canonicalizer": 0.004408000037074089, + "HoistCompute": 3.7999998312443495e-05, + "IdentifyCrossPassTensors": 0.00014699999883305281, + "MemcastMotion": 0.00010399999882793054, + "PenguinizeFunctions": 0.00012099999730708078, + "PruneFunctions": 0.0003819999983534217, + "RemoveOptimizationBarriers": 0.00047400000039488077, + "ScatterMotion": 0.0027620000764727592, + "TensorizerLegalizationPass": 0.00012700000661425292, + "VerifySupportedOps": 0.0001900000061141327, + "algsimp": 0.0011439999798312783, + "batchnorm_expander": 0.0005280000041238964, + "boundary-marker-removal": 0.00015799999528098851, + "call-inliner": 0.00013099999341648072, + "canonicalize-boundary-marker": 0.0002809999859891832, + "collective-stream-id-checker": 3.7000001611886546e-05, + "comparison-expander": 0.00020199999562464654, + "computation-deduplicator": 0.00034699999378062785, + "config-lowering": 0.00011999999696854502, + "constant_folding": 9.200000204145908e-05, + "cse": 0.00037900000461377203, + "dce": 2.300000051036477e-05, + "dynamic-slice-transpose": 9.000000136438757e-05, + "eliminate-redundant-compare": 7.699999696342275e-05, + "emit-offloaded-dropout": 0.0001289999927394092, + "flatten-call-graph": 0.00016199999663513154, + "fuse-send-recv": 0.0006859999848529696, + "hilo-conditional-to-select": 5.2999999752501026e-05, + "hilo::LegalizeAlias": 0.0021039999555796385, + "hilo::NeuronInstCombine": 0.0008660000166855752, + "hilo::NeuronOpFusion": 0.00038499999209307134, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00032500000088475645, + "hilo::ScheduleFusion": 2.700000004551839e-05, + "hilo::SixtyFourHack": 0.00019500000053085387, + "hilo::VerifyAliasing": 4.999999873689376e-05, + "hlo-mac-count": 0.0033499998971819878, + "io-con-pipe-begin": 3.999999989900971e-06, + "io-con-pipe-end": 0.0, + "io-layout-normalization": 0.0005000000237487257, + "legalize-ccops-for-tensorizer": 1.2000000424450263e-05, + "legalize-compare": 0.00015500000154133886, + "lower-argminmax-custom-call": 7.300000288523734e-05, + "map-inline": 0.00038899999344721437, + "metadata-naming": 0.0006970000104047358, + "mlir::detail::OpToOpPassAdaptor": 0.0001320000010309741, + "mlir::hlo::MhloToPyPenguin": 0.021909000352025032, + "mlir::mhlo::LowerComplexExtraPass": 0.0017920000245794654, + "mlir::mhlo::LowerComplexPass": 0.0021520000882446766, + "native-to-custom-softmax": 0.00017699999443721026, + "native-to-custom-softmax-dx": 0.0002229999954579398, + "neuron-hlo-verifier": 0.012144000269472599, + "operand_upcaster": 0.0004459999909158796, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.03227899968624115, + "pre-hlo-begin": 3.000000106112566e-06, + "pre-hlo-end": 0.0, + "replace-minimum-constant": 0.00010099999781232327, + "reshape-mover": 4.600000102072954e-05, + "simplify-concat": 0.0008660000166855752, + "simplify-while-loops": 3.199999991920777e-05, + "transform-variadic-reduce": 0.00023700000019744039, + "tuple-simplifier": 8.600000001024455e-05, + "unpack-nested-aws-ntwsr": 0.00013099999341648072, + "unroll-while-loop": 4.999999873689376e-06 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.00021648406982421875, + "DMALocalityOpt": 0.00018715858459472656, + "DMAProfiler": 0.0007541179656982422, + "DataStreaming": 0.0003209114074707031, + "DoNothing": 0.0001461505889892578, + "ExpandISAMacro": 0.0005350112915039063, + "FactorizeBlkDims": 0.00046825408935546875, + "InferPSumTensor": 0.0005578994750976563, + "InferSharedMemLoc": 0.0002923011779785156, + "InsertCoreBarrier": 0.0002646446228027344, + "LateLegalizeInst": 0.00041866302490234375, + "LateNeuronInstComb": 0.0005877017974853516, + "LegalizeSundaAccess": 0.001458883285522461, + "LegalizeType": 0.0002894401550292969, + "LowerBroadcast": 0.0002651214599609375, + "LowerIntrinsics": 0.00023031234741210938, + "LowerTranspose": 0.0002739429473876953, + "NeuronInstComb": 0.0006477832794189453, + "NeuronLICM": 0.0003752708435058594, + "NeuronSimplifyPredicates": 0.0022547245025634766, + "NeuronValueNumbering": 0.00046515464782714844, + "SFKVectorizer": 0.0027043819427490234, + "SimpleAllReduceTiling": 0.0002167224884033203, + "SimplifyNeuronTensor": 0.0005562305450439453, + "SpillPSum": 0.0004801750183105469, + "WeightCoalescing": 0.0002522468566894531 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 3.5539610385894775, + "HloMacCount": 6957531136.0, + "Traffic": 3915367168.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 1.7005221843719482, + "AffinePredicateResolution": 0.03520393371582031, + "AliasDependencyElimination": 0.0027341842651367188, + "AliasDependencyInduction": 0.2813143730163574, + "AliasDependencyReset": 0.2986414432525635, + "BFComputeCutting": 0.06860709190368652, + "BirCodeGenLoop": 1.6297953128814697, + "CCOpFusion": 0.5585761070251465, + "CanonicalizeDAGForPGTiling": 0.1489403247833252, + "CanonicalizeIR": 0.046598196029663086, + "CoalesceCCOp": 0.16363883018493652, + "CommuteConcat": 0.023354530334472656, + "DMALocalityOpt": 0.03702545166015625, + "DMAProfiler": 0.07511329650878906, + "DMATilingProfiler": 0.08825540542602539, + "DataLocalityOpt": 3.1695549488067627, + "DataStreaming": 0.1460282802581787, + "DeConcat": 0.045781612396240234, + "DeadCodeElimination": 0.024619340896606445, + "DeadStoreElimination": 0.7814517021179199, + "DelinearIndices": 0.46334409713745117, + "Delinearization": 0.10990715026855469, + "DelinearizeSPMD": 0.14683842658996582, + "DoNothing": 6.222724914550781e-05, + "DramToDramTranspose": 0.2727994918823242, + "DumpGraphAndMetadata": 0.12869954109191895, + "EliminateDivs": 0.10750436782836914, + "ExpandBatchNorm": 0.04441714286804199, + "ExpandISAMacro": 0.08370113372802734, + "FactorizeBlkDims": 0.7324323654174805, + "FactorizeThreadAxesInFreeDims": 0.06503605842590332, + "FlattenMacroLoop": 0.08033251762390137, + "GenericAccessSimplifier": 0.02213764190673828, + "InferInitValue": 1.2564187049865723, + "InferIntrinsicOnCC": 0.25655221939086914, + "InferNeuronTensor": 1.5402815341949463, + "InferNonlocalTensors": 2.825716495513916, + "InferPSumTensor": 1.1482248306274414, + "InferShardAxis": 3.487098455429077, + "InferSharedMemLoc": 0.10723090171813965, + "InlineNativeKernels": 0.05088615417480469, + "InsertCoreBarrier": 0.13246965408325195, + "InsertIOTransposes": 1.001678228378296, + "InsertImplicitShardAxisBeforeISel": 0.3830599784851074, + "InsertLocalTransposes": 0.8397743701934814, + "InsertOffloadedTransposes": 0.07152938842773438, + "LICM": 0.12596940994262695, + "LateLegalizeInst": 0.14527416229248047, + "LateLegalizePostSplit": 0.09937071800231934, + "LateLowerReshapeOp": 0.028991222381591797, + "LateLowerTensorOp": 0.22994065284729004, + "LateNeuronInstComb": 1.0325555801391602, + "LayoutPreprocessing": 0.68070387840271, + "LayoutPreprocessingAndAnalysis": 0.991814136505127, + "LayoutRequirementAnalysis": 0.29753780364990234, + "LegalizeCCOpLayout": 0.05271172523498535, + "LegalizeOpLevelAlias": 0.0189054012298584, + "LegalizePartitionReduce": 0.04260826110839844, + "LegalizeSundaAccess": 0.972419023513794, + "LegalizeSundaMacro": 0.6434056758880615, + "LegalizeType": 0.14038896560668945, + "LocalLayoutOpt": 0.5158607959747314, + "LoopFusion": 0.24005889892578125, + "LoopSplitting": 0.026287078857421875, + "LowerBroadcast": 0.05507779121398926, + "LowerCCOpBlockAxis": 0.1771547794342041, + "LowerComplexBroadcast": 0.06588888168334961, + "LowerIntrinsics": 1.1976900100708008, + "LowerShardAxis": 0.20541048049926758, + "LowerTensorOp": 0.34528374671936035, + "LowerToSendRecv": 0.13611459732055664, + "LowerTranspose": 0.47621941566467285, + "MacroGeneration": 2.1315596103668213, + "MaskPropagation": 0.0985419750213623, + "MemcpyElimination": 3.2312090396881104, + "MutateDataType": 0.030095577239990234, + "NeuronAliasDependencyInduction": 0.018324851989746094, + "NeuronAliasDependencyReset": 0.03435230255126953, + "NeuronInstComb": 0.3214244842529297, + "NeuronLICM": 0.2621889114379883, + "NeuronLoopFusion": 1.4137792587280273, + "NeuronLoopInterchange": 0.047950029373168945, + "NeuronSimplifier": 0.46399831771850586, + "NeuronSimplifyPredicates": 0.13630390167236328, + "NeuronValueNumbering": 0.10926556587219238, + "OptimizeAliasedCopyChain": 0.0093231201171875, + "OptimizeNKIKernels": 1.330866813659668, + "PAGLayoutOpt": 10.357507705688477, + "PComputeCutting": 0.263751745223999, + "PGLayoutTilingPipeline": 24.891653060913086, + "PGTiling": 4.457480430603027, + "PadElimination": 0.009973287582397461, + "ParAxesAnnotation": 9.500602722167969, + "PartialLoopFusion": 0.991065263748169, + "PartialSimdFusion": 0.5688676834106445, + "PerfectLoopNest": 0.06462287902832031, + "RecognizeOpIdiom": 0.11909127235412598, + "Recompute": 0.005625009536743164, + "RelaxPredicates": 0.1096184253692627, + "Rematerialization": 0.13873934745788574, + "RemoveShardedPartitionAxes": 0.6388535499572754, + "ReshapeWeights": 0.021928071975708008, + "ResolveAccessConflict": 0.1634044647216797, + "ResolveComplicatePredicates": 0.034531354904174805, + "RewriteReplicationMatmul": 0.03591442108154297, + "RewriteWeights": 0.06955599784851074, + "SFKVectorizer": 5.442811965942383, + "ShardingPropagationAnalysis": 0.5668590068817139, + "SimpleAllReduceTiling": 0.061127662658691406, + "Simplifier": 0.07685470581054688, + "SimplifyMacroPredicates": 0.25678086280822754, + "SimplifyNeuronTensor": 0.3089778423309326, + "SimplifySlice": 0.023622751235961914, + "SimplifyTensor": 0.2752950191497803, + "SpillPSum": 0.5238773822784424, + "SplitAPUnionSets": 0.7056703567504883, + "SplitAccGrp": 0.04378986358642578, + "StaticProfiler": 0.12368035316467285, + "StaticTransposeLocalTensor": 0.21344637870788574, + "SundaISel": 1.3892035484313965, + "TCTransform": 0.02511763572692871, + "TensorInitialization": 0.17117786407470703, + "TensorOpSimplifier": 0.15215492248535156, + "TensorOpTransform": 0.7248854637145996, + "TileCCOps": 0.2080836296081543, + "TilingProfiler": 0.3465256690979004, + "TransformConvOp": 0.0547327995300293, + "TritiumFusion": 0.8802590370178223, + "ValueNumbering": 0.06971311569213867, + "VectorizeDMA": 0.5965697765350342, + "VectorizeMatMult": 0.050161123275756836, + "WeightCoalescing": 0.05379343032836914, + "ZeroSizeTensorElimination": 0.00035262107849121094 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 84014, + "StaticProfiler::AifUb": 10.269017219543457, + "StaticProfiler::ArithmeticIntensityTensorizer": 31.725021362304688, + "StaticProfiler::AverageDmaLength": 2704.250244140625, + "StaticProfiler::AverageFractalPeUtilization": 97.6100845336914, + "StaticProfiler::AveragePartitionUtilization": 91.14380645751953, + "StaticProfiler::AveragePeUtilization": 81.960205078125, + "StaticProfiler::DDRTransferBytes": 1874197460, + "StaticProfiler::InternalTransferBytes": 366183616, + "StaticProfiler::LoadExpanded": 534942, + "StaticProfiler::LocalizationEfficiency": 308.939208984375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 318.39825439453125, + "StaticProfiler::StoreExpanded": 22685, + "StaticProfiler::TotalDMAExpanded": 557627, + "StaticProfiler::TotalDynamicInstancesCount": 106305, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 91415, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 355, + "TilingProfiler::MatMultInstructionsAfterTiling": 63265, + "TilingProfiler::NumPfTransposes": 350, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 200, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 15058, + "TilingProfiler::PfTransposeInstructionsForIo": 11297, + "TilingProfiler::PfTransposeInstructionsForLocal": 1351, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 2410, + "TilingProfiler::ReduceInstructionsAfterTiling": 61, + "TilingProfiler::SimdInstructionsAfterTiling": 2597, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.0026199817657470703, + "DMALocalityOpt": 0.0019347667694091797, + "DMAProfiler": 0.0026493072509765625, + "DataStreaming": 0.004080295562744141, + "DoNothing": 0.00015044212341308594, + "ExpandISAMacro": 0.0026907920837402344, + "FactorizeBlkDims": 0.007323265075683594, + "InferPSumTensor": 0.006970405578613281, + "InferSharedMemLoc": 0.0019783973693847656, + "InsertCoreBarrier": 0.002153158187866211, + "LateLegalizeInst": 0.004822969436645508, + "LateNeuronInstComb": 0.0058879852294921875, + "LegalizeSundaAccess": 0.008666038513183594, + "LegalizeType": 0.005538225173950195, + "LowerBroadcast": 0.0023374557495117188, + "LowerIntrinsics": 0.0024814605712890625, + "LowerTranspose": 0.0023925304412841797, + "NeuronInstComb": 0.005986690521240234, + "NeuronLICM": 0.006104946136474609, + "NeuronSimplifyPredicates": 0.002489328384399414, + "NeuronValueNumbering": 0.0027534961700439453, + "SFKVectorizer": 0.01934218406677246, + "SimpleAllReduceTiling": 0.002424001693725586, + "SimplifyNeuronTensor": 0.04510951042175293, + "SpillPSum": 0.012008190155029297, + "WeightCoalescing": 0.0024013519287109375 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk0/graph.neff b/token_generation_model/_tp0_bk0/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..59cda9c9d4dd4bd28eb422ad1576ededc4bb3f2b --- /dev/null +++ b/token_generation_model/_tp0_bk0/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47fa87b2ad3d007fa8f4555b943efab99aeb4f481845eaaf73808db23aa72c4 +size 5970944 diff --git a/token_generation_model/_tp0_bk0/log-neuron-cc.txt b/token_generation_model/_tp0_bk0/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee6a00392ffd2ac78e3589e392fc76b8e94bbffe --- /dev/null +++ b/token_generation_model/_tp0_bk0/log-neuron-cc.txt @@ -0,0 +1,4243 @@ +2025-11-04T21:36:46Z INFO 8303 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/log-neuron-cc.txt --enable-internal-neff-wrapper --verbose=35 +2025-11-04T21:36:46Z INFO 8303 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:36:46Z INFO 8316 [root]: XLA detected +2025-11-04T21:36:46Z INFO 8316 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:36:46Z INFO 8316 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0 +2025-11-04T21:36:46Z INFO 8316 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:36:46Z INFO 8316 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:36:46Z INFO 8316 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:36:46Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:36:46Z INFO 8316 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:36:46Z INFO 8316 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:36:46Z INFO 8316 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:36:47Z INFO 8316 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:36:46.979684: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:36:46.982941: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10713 = tuple(%reshape.3670, %scatter.9941, %scatter.9956, %scatter.9969, %scatter.9984, %scatter.9997, %scatter.10012, %scatter.10025, %scatter.10040, %scatter.10053, %scatter.10068, %scatter.10081, %scatter.10096, %scatter.10109, %scatter.10124, %scatter.10137, %scatter.10152, %scatter.10165, %scatter.10180, %scatter.10193, %scatter.10208, %scatter.10221, %scatter.10236, %scatter.10249, %scatter.10264, %scatter.10277, %scatter.10292, %scatter.10305, %scatter.10320, %scatter.10333, %scatter.10348... to 512 characters in the compiler's debug metadata +Transposable weight idxs: 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300,301,302,303,304,305,306,307,308,309,310,311,312,313,314,315,316,317,318,319,320,321,322,323,324,325,326,327,328,329,330,331,332,333,334,335,336,337,338,339,340,341,342,343,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370 +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:36:47Z INFO 8316 [job.HLOToTensorizer.0]: IR signature: feeeccb140e5038ab3e503e595a5de4b9b1f911e315b0efd39f4f2d115bf01a0 for sg0000/HLOToTensorizer +2025-11-04T21:36:47Z INFO 8316 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:36:47Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:36:47Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:36:47Z INFO 8316 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:36:47Z INFO 8316 [job.Frontend.0]: Processing input #0 +2025-11-04T21:36:47Z INFO 8316 [job.Frontend.0]: Start model loading +2025-11-04T21:36:47Z INFO 8316 [job.Frontend.0]: Start tensorization +2025-11-04T21:36:47Z INFO 8316 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:36:47Z USER 8316 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:36:47Z INFO 8316 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:36:47Z INFO 8316 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:36:48Z INFO 8316 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.019 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.009 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.038 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.055 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.345 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.194 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.212 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.053 seconds +2025-11-04T21:36:48Z INFO 8316 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.152 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.047 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.035 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.035 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.108 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.035 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.185 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.207 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.103 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.496 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.034 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.036 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.036 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.037 seconds +2025-11-04T21:36:49Z INFO 8316 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.044 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.037 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.034 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 0.646 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.079 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 0.725 seconds +2025-11-04T21:36:50Z INFO 8316 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.230 seconds +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.281 seconds +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.299 seconds +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:36:51Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 3.114 seconds +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.116 seconds +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 3.231 seconds +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:36:54Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.447 seconds +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.214 seconds +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.158 seconds +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.305 seconds +2025-11-04T21:36:55Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.291 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 1.422 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.139 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.113 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.077 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.076 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.266 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.414 seconds +2025-11-04T21:36:56Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:36:57Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:36:57Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.065 seconds +2025-11-04T21:36:57Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:36:57Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.084 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.085 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.057 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.096 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.219 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.093 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.190 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.507 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.024 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.044 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.079 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.078 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.157 seconds +2025-11-04T21:36:58Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.088 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.043 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.010 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.090 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.084 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.152 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.240 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.022 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.077 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.077 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.043 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.070 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.025 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.023 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.023 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.119 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.119 seconds +2025-11-04T21:36:59Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.082 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 0.781 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.006 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.024 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.025 seconds +2025-11-04T21:37:00Z INFO 8316 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.030 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.022 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.077 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.077 seconds +2025-11-04T21:37:00Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-7393 AllGather_add(float32 (256, 8) %'transpose.228', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9766 | hlo_id: 9766 | , id = 7393 +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-7409 AllGather_add(uint32 (256, 8) %'transpose.229', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9905 | hlo_id: 9905 | , id = 7409 +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.208 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.257 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.084 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.212 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.024 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.025 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.029 seconds +2025-11-04T21:37:01Z INFO 8316 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.257 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.026 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.025 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.163 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.047 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.516 seconds +2025-11-04T21:37:02Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.230 seconds +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.184 seconds +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.090 seconds +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.681 seconds +2025-11-04T21:37:03Z INFO 8316 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:37:04Z INFO 8316 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.298 seconds +2025-11-04T21:37:04Z INFO 8316 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 0.992 seconds +2025-11-04T21:37:04Z INFO 8316 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:37:04Z INFO 8316 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:37:05Z INFO 8316 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:37:07Z INFO 8316 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:37:07Z INFO 8316 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 2.826 seconds +2025-11-04T21:37:07Z INFO 8316 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:37:07Z INFO 8316 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:37:07Z INFO 8316 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:37:16Z INFO 8316 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:37:16Z INFO 8316 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 9.501 seconds +2025-11-04T21:37:16Z INFO 8316 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.840 seconds +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 10.358 seconds +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.110 seconds +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.147 seconds +2025-11-04T21:37:17Z INFO 8316 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:37:18Z INFO 8316 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.567 seconds +2025-11-04T21:37:18Z INFO 8316 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:37:19Z INFO 8316 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:37:20Z INFO 8316 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1215 +total number of sharded dags: 379 + +total bytes transferred from input, output, non local tensors: 1786198344 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 1754431620 +% bytes transferred with 2x bandwidths: 98.22 + +NC0 FLOPs: 6980708505 +NC1 FLOPs: 6916142464 +% FLOPs sharded: 99.52 + + +Shard dim: 2, Number of dags: 168 +Matmuls sharded with this dim: +[2,2(s),64] @ [2(s),64] = [2] Number of occurrences: 28 +[2,2(s),64] @ [2(s),64,128] = [2,128] Number of occurrences: 28 + + +Shard dim: 2048, Number of dags: 112 +Matmuls sharded with this dim: +[8,2048(s)] @ [2048(s),4,128] = [8,4,128] Number of occurrences: 28 +[8,2048(s)] @ [2048(s),4,2,2,64] = [8,4,2,2,64] Number of occurrences: 28 +[8,2048(s)] @ [2048(s),4,2,64] = [8,4,2,64] Number of occurrences: 28 +[8,4,2,128] @ [4,2,128,2048(s)] = [8,2048(s)] Number of occurrences: 28 + + +Shard dim: 3072, Number of dags: 84 +Matmuls sharded with this dim: +[8,2048] @ [2048,3072(s)] = [8,3072(s)] Number of occurrences: 56 +[8,3072(s)] @ [3072(s),2048] = [8,2048] Number of occurrences: 28 + + +Shard dim: 256, Number of dags: 10 +Matmuls sharded with this dim: + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2048] @ [2048,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:37:20Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.463 seconds +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 0.639 seconds +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 3.487 seconds +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.099 seconds +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.149 seconds +2025-11-04T21:37:21Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:37:22Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:37:22Z INFO 8316 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.177 seconds +2025-11-04T21:37:22Z INFO 8316 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:37:22Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 7662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (132, 'AG2582'), (276, 'AG2575'), (160, 'AG2580')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 7956 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (133, 'AG2591'), (276, 'AG2575'), (161, 'AG2589')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8207 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (134, 'AG2600'), (276, 'AG2575'), (162, 'AG2598')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (135, 'AG2609'), (276, 'AG2575'), (163, 'AG2607')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8709 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (136, 'AG2618'), (276, 'AG2575'), (164, 'AG2616')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8960 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (137, 'AG2627'), (276, 'AG2575'), (165, 'AG2625')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9211 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (138, 'AG2636'), (276, 'AG2575'), (166, 'AG2634')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (139, 'AG2645'), (276, 'AG2575'), (167, 'AG2643')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (140, 'AG2654'), (276, 'AG2575'), (168, 'AG2652')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9964 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (141, 'AG2663'), (276, 'AG2575'), (169, 'AG2661')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10215 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (142, 'AG2672'), (276, 'AG2575'), (170, 'AG2670')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10466 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (143, 'AG2681'), (276, 'AG2575'), (171, 'AG2679')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (144, 'AG2690'), (276, 'AG2575'), (172, 'AG2688')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10968 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (145, 'AG2699'), (276, 'AG2575'), (173, 'AG2697')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11219 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (146, 'AG2708'), (276, 'AG2575'), (174, 'AG2706')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11470 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (147, 'AG2717'), (276, 'AG2575'), (175, 'AG2715')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11721 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (148, 'AG2726'), (276, 'AG2575'), (176, 'AG2724')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11972 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (149, 'AG2735'), (276, 'AG2575'), (177, 'AG2733')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12223 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (150, 'AG2744'), (276, 'AG2575'), (178, 'AG2742')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (151, 'AG2753'), (276, 'AG2575'), (179, 'AG2751')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12725 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (152, 'AG2762'), (276, 'AG2575'), (180, 'AG2760')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12976 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (153, 'AG2771'), (276, 'AG2575'), (181, 'AG2769')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13227 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (154, 'AG2780'), (276, 'AG2575'), (182, 'AG2778')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (155, 'AG2789'), (276, 'AG2575'), (183, 'AG2787')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (156, 'AG2798'), (276, 'AG2575'), (184, 'AG2796')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13980 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (157, 'AG2807'), (276, 'AG2575'), (185, 'AG2805')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14231 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (158, 'AG2816'), (276, 'AG2575'), (186, 'AG2814')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 4, 32, 128, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (159, 'AG2825'), (276, 'AG2575'), (187, 'AG2823')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 7876 of IO tensor non_local bfloat16 %all_reduce.1(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 7909 of IO tensor non_local bfloat16 %all_reduce.3(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8127 of IO tensor non_local bfloat16 %all_reduce.5(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8160 of IO tensor non_local bfloat16 %all_reduce.7(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8378 of IO tensor non_local bfloat16 %all_reduce.9(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8411 of IO tensor non_local bfloat16 %all_reduce.11(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8629 of IO tensor non_local bfloat16 %all_reduce.13(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8662 of IO tensor non_local bfloat16 %all_reduce.15(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8880 of IO tensor non_local bfloat16 %all_reduce.17(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 8913 of IO tensor non_local bfloat16 %all_reduce.19(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9131 of IO tensor non_local bfloat16 %all_reduce.21(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9164 of IO tensor non_local bfloat16 %all_reduce.23(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9382 of IO tensor non_local bfloat16 %all_reduce.25(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9415 of IO tensor non_local bfloat16 %all_reduce.27(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9633 of IO tensor non_local bfloat16 %all_reduce.29(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9666 of IO tensor non_local bfloat16 %all_reduce.31(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9884 of IO tensor non_local bfloat16 %all_reduce.33(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9917 of IO tensor non_local bfloat16 %all_reduce.35(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10135 of IO tensor non_local bfloat16 %all_reduce.37(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10168 of IO tensor non_local bfloat16 %all_reduce.39(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10386 of IO tensor non_local bfloat16 %all_reduce.41(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10419 of IO tensor non_local bfloat16 %all_reduce.43(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10637 of IO tensor non_local bfloat16 %all_reduce.45(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10670 of IO tensor non_local bfloat16 %all_reduce.47(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10888 of IO tensor non_local bfloat16 %all_reduce.49(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10921 of IO tensor non_local bfloat16 %all_reduce.51(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11139 of IO tensor non_local bfloat16 %all_reduce.53(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11172 of IO tensor non_local bfloat16 %all_reduce.55(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11390 of IO tensor non_local bfloat16 %all_reduce.57(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11423 of IO tensor non_local bfloat16 %all_reduce.59(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11641 of IO tensor non_local bfloat16 %all_reduce.61(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11674 of IO tensor non_local bfloat16 %all_reduce.63(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11892 of IO tensor non_local bfloat16 %all_reduce.65(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11925 of IO tensor non_local bfloat16 %all_reduce.67(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12143 of IO tensor non_local bfloat16 %all_reduce.69(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12176 of IO tensor non_local bfloat16 %all_reduce.71(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12394 of IO tensor non_local bfloat16 %all_reduce.73(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12427 of IO tensor non_local bfloat16 %all_reduce.75(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12645 of IO tensor non_local bfloat16 %all_reduce.77(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12678 of IO tensor non_local bfloat16 %all_reduce.79(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12896 of IO tensor non_local bfloat16 %all_reduce.81(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12929 of IO tensor non_local bfloat16 %all_reduce.83(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13147 of IO tensor non_local bfloat16 %all_reduce.85(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13180 of IO tensor non_local bfloat16 %all_reduce.87(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13398 of IO tensor non_local bfloat16 %all_reduce.89(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13431 of IO tensor non_local bfloat16 %all_reduce.91(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13649 of IO tensor non_local bfloat16 %all_reduce.93(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13682 of IO tensor non_local bfloat16 %all_reduce.95(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13900 of IO tensor non_local bfloat16 %all_reduce.97(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13933 of IO tensor non_local bfloat16 %all_reduce.99(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14151 of IO tensor non_local bfloat16 %all_reduce.101(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14184 of IO tensor non_local bfloat16 %all_reduce.103(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14402 of IO tensor non_local bfloat16 %all_reduce.105(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14435 of IO tensor non_local bfloat16 %all_reduce.107(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14653 of IO tensor non_local bfloat16 %all_reduce.109(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14686 of IO tensor non_local bfloat16 %all_reduce.111(8, 2, 1024) is not sorted, index list (w/ AG ids): [(131, 'AG2576'), (1, 'AG2578')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14767 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG2847'), (6, 'AG2846')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 20776 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG2840'), (5, 'AG2844')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14742 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(42, 'AG2840'), (5, 'AG2844')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14785 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG2847'), (6, 'AG2846')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14802 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG2847'), (6, 'AG2846')] +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 1.701 seconds +2025-11-04T21:37:23Z INFO 8316 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.213 seconds +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.264 seconds +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.069 seconds +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.026 seconds +2025-11-04T21:37:24Z INFO 8316 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:37:26Z INFO 8316 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:37:26Z INFO 8316 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.132 seconds +2025-11-04T21:37:26Z INFO 8316 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 4.457 seconds +2025-11-04T21:37:26Z INFO 8316 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 1.002 seconds +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.072 seconds +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:37:27Z INFO 8316 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.273 seconds +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 24.892 seconds +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.347 seconds +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.244 seconds +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:37:28Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.484 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.055 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.540 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.244 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.244 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.077 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.036 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.135 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.101 seconds +2025-11-04T21:37:30Z INFO 8316 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:37:33Z INFO 8316 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:37:33Z INFO 8316 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.170 seconds +2025-11-04T21:37:33Z INFO 8316 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.088 seconds +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.294 seconds +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.295 seconds +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:37:34Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.643 seconds +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.383 seconds +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.461 seconds +2025-11-04T21:37:35Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.460 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.921 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.065 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.252 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.070 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.022 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.080 seconds +2025-11-04T21:37:36Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:37:37Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:37:37Z INFO 8316 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.257 seconds +2025-11-04T21:37:37Z INFO 8316 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.256 seconds +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.463 seconds +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.464 seconds +2025-11-04T21:37:38Z INFO 8316 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.074 seconds +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.275 seconds +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/LICM]: LICM finished after 0.126 seconds +2025-11-04T21:37:39Z INFO 8316 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.389 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.003 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.018 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.034 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.066 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.062 seconds +2025-11-04T21:37:40Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.197 seconds +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.598 seconds +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.231 seconds +2025-11-04T21:37:41Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.197 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.194 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.192 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.414 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.048 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.231 seconds +2025-11-04T21:37:42Z INFO 8316 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:37:43Z INFO 8316 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:37:43Z INFO 8316 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.732 seconds +2025-11-04T21:37:43Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:37:43Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.660 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.165 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.140 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.973 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.109 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.164 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.151 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.321 seconds +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:37:45Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.079 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.551 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.045 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.597 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.120 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.043 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.045 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.046 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.065 seconds +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:37:46Z INFO 8316 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:37:47Z INFO 8316 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.568 seconds +2025-11-04T21:37:47Z INFO 8316 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:37:47Z INFO 8316 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.569 seconds +2025-11-04T21:37:47Z INFO 8316 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 0.880 seconds +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.417 seconds +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.417 seconds +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.050 seconds +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:37:48Z INFO 8316 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:37:49Z INFO 8316 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 0.991 seconds +2025-11-04T21:37:49Z INFO 8316 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:37:49Z INFO 8316 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 0.991 seconds +2025-11-04T21:37:49Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.185 seconds +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.476 seconds +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.055 seconds +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:37:50Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.878 seconds +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.148 seconds +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.033 seconds +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.044 seconds +2025-11-04T21:37:51Z INFO 8316 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:37:52Z INFO 8316 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:37:52Z INFO 8316 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.524 seconds +2025-11-04T21:37:52Z INFO 8316 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.198 seconds +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.051 seconds +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.140 seconds +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.262 seconds +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:37:53Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:37:54Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.619 seconds +2025-11-04T21:37:54Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.528 seconds +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.148 seconds +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.054 seconds +2025-11-04T21:37:55Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.972 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.110 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.171 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.136 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.084 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.032 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.309 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.037 seconds +2025-11-04T21:37:56Z INFO 8316 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:37:57Z INFO 8316 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:37:57Z INFO 8316 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.146 seconds +2025-11-04T21:37:57Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:00Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 1.728 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.188 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 5.443 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.145 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.164 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.061 seconds +2025-11-04T21:38:02Z INFO 8316 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.132 seconds +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 13.540% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'26300.44774'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': '', 'transposable': True}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=44773, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_26300 | hlo_id: 14374 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 2.053% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'26813.44784'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=44782, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_26813 | hlo_id: 14374 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 138.776us (32.031MiB, est bw: 242.024GB/s, 1.234% of tot. time) for bfloat16<128 x 8200> TongaSB partitions[2] bfloat16 (2, 8, 128, 8200) %'all_gather.1_nostride_49012'(init=0.0)[i242_0_31367,c0_28132,i0.128,i1.8200] = load bfloat16<128 x 8200> non_local bfloat16 (16384,) %'all_gather.1'[8i0.128+1024c0_28132+i1.8200] # id=36998, src_id=None, , attrs={'can_read_uninit': True}, instances=16 # dl = tensor_op_name: _add.395 | hlo_id: 395 | [[i0.128];[i1.8200]] -> [[i0.128];[i1.8200]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input68_local_28176'[i242_0_31367,4i243_0_0_0+i243_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input68'[4i243_0_0_0+i243_0_0_1,i242_0_31367,i0.128,i1.3072] # id=37013, src_id=None, , instances=16 # dl = tensor_op_name: _dot.425 | hlo_id: 11374 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input79_local_28289'[i414_0_31430,4i415_0_0_0+i415_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input79'[4i415_0_0_0+i415_0_0_1,i414_0_31430,i0.128,i1.3072] # id=37209, src_id=None, , instances=16 # dl = tensor_op_name: _dot.769 | hlo_id: 11485 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input90_local_28402'[i586_0_31493,4i587_0_0_0+i587_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input90'[4i587_0_0_0+i587_0_0_1,i586_0_31493,i0.128,i1.3072] # id=37405, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1113 | hlo_id: 11596 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input101_local_28515'[i758_0_31556,4i759_0_0_0+i759_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input101'[4i759_0_0_0+i759_0_0_1,i758_0_31556,i0.128,i1.3072] # id=37601, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1457 | hlo_id: 11707 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input112_local_28628'[i930_0_31619,4i931_0_0_0+i931_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input112'[4i931_0_0_0+i931_0_0_1,i930_0_31619,i0.128,i1.3072] # id=37797, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1801 | hlo_id: 11818 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input123_local_28741'[i1102_0_31682,4i1103_0_0_0+i1103_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input123'[4i1103_0_0_0+i1103_0_0_1,i1102_0_31682,i0.128,i1.3072] # id=37993, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2145 | hlo_id: 11929 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.523% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input134_local_28854'[i1274_0_31745,4i1275_0_0_0+i1275_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': '', 'transposable': True, 'static_io_transpose': {'reshape': (8, 2, 128, 2, 12, 128), 'transpose': [0, 3, 5, 4, 1, 2]}}bfloat16 (8, 2, 128, 3072) %'input134'[4i1275_0_0_0+i1275_0_0_1,i1274_0_31745,i0.128,i1.3072] # id=38189, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2489 | hlo_id: 12040 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.075 seconds +2025-11-04T21:38:03Z INFO 8316 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.016 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.048 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.024 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.012 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.007 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.045 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.019 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:03Z INFO 8316 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:38:04Z INFO 8316 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:38:04Z INFO 8316 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.331 seconds +2025-11-04T21:38:04Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:04Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.674 seconds +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.675 seconds +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:38:05Z WARNING 8316 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 75.49 percent of all matmul computation +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.124 seconds +2025-11-04T21:38:05Z INFO 8316 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.706 seconds +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.099 seconds +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.107 seconds +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.205 seconds +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:38:06Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.558 seconds +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.559 seconds +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.129 seconds +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.136 seconds +2025-11-04T21:38:07Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:09Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:09Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.682 seconds +2025-11-04T21:38:10Z INFO 8316 [Tensorizer]: BirCodeGen estimate #instances=66814 in sg0000 +2025-11-04T21:38:10Z INFO 8316 [Tensorizer]: IR signature: 184a6ad87a70af4536d0bb65f3e6670853c62214c2b20fa1455320d00b0b50a8 for nc00/sg0000/TensorizerBIR +2025-11-04T21:38:10Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:38:12Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:38:12Z INFO 8316 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.630 seconds +2025-11-04T21:38:13Z INFO 8316 [Tensorizer]: BirCodeGen estimate #instances=66814 in sg0000 +2025-11-04T21:38:13Z INFO 8316 [Tensorizer]: IR signature: 58cc8d1a397c0711120250b7c31c1b25b82ebfc5cb8c667a07edc086a37d5813 for nc01/sg0000/TensorizerBIR +2025-11-04T21:38:13Z INFO 8316 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:38:13Z INFO 8316 [Tensorizer]: Successfully built model. +2025-11-04T21:38:13Z USER 8316 [root/Tensorizer/Tensorizer]: Tensorizer finished after 86.485 seconds +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: End tensorization +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input0 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input1 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input2 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input3 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input4 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input5 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input6 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input7 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input8 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input9 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input10 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input11 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input12 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input13 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input14 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input15 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input16 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input17 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input18 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input19 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input20 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input21 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input22 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input23 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input24 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input25 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input26 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input27 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input28 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input29 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input30 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input31 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input32 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input33 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input34 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input35 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input36 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input37 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input38 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input39 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input40 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input41 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input42 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input43 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input44 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input45 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input46 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input47 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input48 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input49 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input50 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input51 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input52 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input53 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input54 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input55 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input56 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input57 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input58 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input59 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input60 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input61 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input62 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input63 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input64 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input65 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input66 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input67 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input68 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input69 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input70 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input71 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input72 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input73 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input74 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input75 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input76 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input77 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input78 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input79 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input80 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input81 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input82 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input83 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input84 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input85 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input86 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input87 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input88 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input89 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input90 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input91 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input92 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input93 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input94 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input95 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input96 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input97 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input98 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input99 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input100 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input101 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input102 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input103 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input104 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input105 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input106 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input107 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input108 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input109 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input110 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input111 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input112 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input113 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input114 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input115 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input116 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input117 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input118 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input119 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input120 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input121 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input122 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input123 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input124 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input125 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input126 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input127 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input128 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input129 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input130 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input131 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input132 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input133 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input134 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input135 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input136 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input137 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input138 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input139 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input140 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input141 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input142 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input143 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input144 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input145 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input146 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input147 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input148 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input149 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input150 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input151 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input152 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input153 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input154 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input155 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input156 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input157 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input158 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input159 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input160 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input161 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input162 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input163 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input164 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input165 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input166 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input167 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input168 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input169 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input170 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input171 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input172 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input173 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input174 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input175 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input176 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input177 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input178 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input179 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input180 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input181 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input182 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input183 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input184 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input185 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input186 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input187 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input188 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input189 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input190 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input191 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input192 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input193 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input194 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input195 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input196 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input197 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input198 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input199 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input200 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input201 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input202 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input203 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input204 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input205 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input206 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input207 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input208 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input209 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input210 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input211 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input212 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input213 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input214 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input215 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input216 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input217 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input218 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input219 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input220 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input221 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input222 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input223 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input224 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input225 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input226 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input227 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input228 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input229 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input230 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input231 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input232 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input233 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input234 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input235 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input236 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input237 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input238 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input239 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input240 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input241 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input242 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input243 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input244 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input245 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input246 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input247 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input248 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input249 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input250 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input251 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input252 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input253 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input254 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input255 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input256 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input257 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input258 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input259 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input260 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input261 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input262 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input263 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input264 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input265 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input266 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input267 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input268 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input269 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input270 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input271 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input272 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input273 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input274 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input275 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input276 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input277 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input278 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input279 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input280 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input281 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input282 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input283 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input284 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input285 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input286 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input287 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input288 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input289 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input290 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input291 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input292 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input293 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input294 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input295 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input296 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input297 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input298 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input299 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input300 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input301 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input302 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input303 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input304 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input305 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input306 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input307 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input308 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input309 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input310 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input311 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input312 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input313 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input314 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input315 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input316 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input317 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input318 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input319 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input320 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input321 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input322 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input323 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input324 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input325 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input326 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input327 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input328 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input329 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input330 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input331 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input332 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input333 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input334 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input335 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input336 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input337 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input338 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input339 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input340 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input341 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input342 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input343 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input344 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input345 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input346 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input347 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input348 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input349 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input350 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input351 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input352 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input353 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input354 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input355 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input356 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input357 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input358 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input359 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input360 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input361 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input362 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input363 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input364 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input365 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input366 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input367 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input368 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input369 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Network input: input370 +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: wrote bir.json +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:38:13Z INFO 8316 [job.Frontend.0]: Job #0 finished +2025-11-04T21:38:13Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:38:13Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:38:13Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:38:13Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels io,scalar_dynamic_offset,spill_reload,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:38:13Z INFO 8316 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:38:14Z INFO 8419 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge" +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:38:14Z INFO 8419 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:38:14Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:14Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12058 blocks=2 instructions=10780 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 365mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z WARNING 8419 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.375.52293}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:14Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.375.52293}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.098 seconds +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.103 seconds +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:14Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.111 seconds +2025-11-04T21:38:14Z INFO 8419 [BackendPassManager]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:14Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12058 blocks=2 instructions=10780 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:14Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12058 blocks=2 instructions=10780 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.003 seconds +2025-11-04T21:38:14Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12058 memory location(s), 2 block(s), and 10780 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:14Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:14Z INFO 8419 [BackendPassManager]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:14Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12058 blocks=2 instructions=10780 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 524mb, ru_maxrss: 797mb (delta=0mb) +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6029 memory location(s), 1 block(s), and 5390 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:14 2025 +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6029 blocks=1 instructions=5390 Max writers: 49 Max Readers: 341 +2025-11-04T21:38:14Z INFO 8419 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:38:14 2025 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:14 2025 + +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Total count: 54339 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Matmult: 42378 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: GenericCopy: 3965 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Load: 2224 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: TensorScalarPtr: 1967 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: TensorTensor: 1225 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Activation: 761 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Memset: 245 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: CollectiveCompute: 232 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: CoreBarrier: 183 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Iota: 170 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: DMACopy: 122 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Select: 30 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 122 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:38:14 2025 + +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.829 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1233mb, ru_maxrss: 1233mb (delta=436mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Total count: 54939 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Matmult: 42378 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: GenericCopy: 3965 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Load: 2224 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: TensorScalarPtr: 2191 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: TensorTensor: 1225 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Activation: 761 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Save: 490 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Iota: 282 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Memset: 245 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: CollectiveCompute: 232 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: CoreBarrier: 183 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Select: 30 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.847 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 856mb, ru_maxrss: 1233mb (delta=436mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 30139 memory location(s), 1 block(s), and 54339 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 30139 memory location(s), 1 block(s), and 54939 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=30139 blocks=1 instructions=54339 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=30139 blocks=1 instructions=54939 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.123 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 859mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.126 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 859mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.982 seconds +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: curr_vmrss: 859mb, ru_maxrss: 1233mb (delta=436mb) +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.003 seconds +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 859mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25397 memory location(s), 2 block(s), and 107959 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.007 seconds +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: curr_vmrss: 859mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.425-t36551_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.425||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.769-t36559_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.769||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1113-t36567_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1113||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1457-t36575_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1457||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1801-t36583_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1801||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2145-t36591_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2145||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2489-t36599_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2489||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2833-t36607_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2833||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3177-t36615_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3177||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3521-t36623_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3521||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3865-t36631_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3865||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4209-t36639_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4209||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4553-t36647_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4553||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4897-t36655_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4897||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5241-t36663_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5241||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5585-t36671_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5585||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5929-t36679_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5929||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6273-t36687_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6273||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6617-t36695_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6617||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6961-t36703_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6961||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7305-t36711_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7305||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7649-t36719_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7649||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7993-t36727_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7993||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8337-t36735_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8337||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8681-t36743_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8681||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9025-t36751_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9025||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9369-t36759_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9369||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9713-t36767_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9713||UNDEF||[128, 128, 1]> +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_23591_33557_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:38:15Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_23595_33562_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.095 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 878mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.098 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 879mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.101 seconds +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: curr_vmrss: 879mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.013 seconds +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 879mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25397 memory location(s), 2 block(s), and 107959 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.016 seconds +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: curr_vmrss: 879mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:15Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25397 blocks=2 instructions=107959 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.011 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.011 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.005 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.037 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.043 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.006 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z WARNING 8419 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.006 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 880mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z WARNING 8419 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 881mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:38:15Z INFO 8419 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:15Z INFO 8419 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.012 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.014 seconds +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.041 seconds +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 883mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:15Z INFO 8419 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:38:15Z INFO 8419 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.013 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.014 seconds +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.042 seconds +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 883mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:38:15Z INFO 8419 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:15Z INFO 8419 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.186 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 884mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.011 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 884mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.193 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 884mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.011 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 884mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.028 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 885mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.028 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.033 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.007 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12293 memory location(s), 1 block(s), and 53031 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.034 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=12293 blocks=1 instructions=53031 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.007 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Found 225 Splits CCs +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: Grouped CCs to 225 clusters. +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 886mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13104 memory location(s), 1 block(s), and 54928 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=13104 blocks=1 instructions=54928 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: To Spill 3 multi-layer tensors +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Found 225 Splits CCs +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: Grouped CCs to 225 clusters. +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: To Spill 3 multi-layer tensors +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:38:16Z INFO 8419 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [build_flow_deps]: Allocs: 12301 instructions: 53039 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [build_flow_deps]: Allocs: 13112 instructions: 54932 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 140922 edges +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [build_flow_deps]: Done build fdeps 140922 Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 153281 edges +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [build_flow_deps]: Done build fdeps 153281 Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.372 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 951mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12301 memory location(s), 1 block(s), and 53039 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=12301 blocks=1 instructions=53039 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 29 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:38:16 2025 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.393 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13112 memory location(s), 1 block(s), and 54932 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=13112 blocks=1 instructions=54932 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 59 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.115 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12272 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=12272 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12273 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=12273 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.119 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13054 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=13054 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.006 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 952mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: size = 4454 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: found 6481 edges +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: mean: 2.91019 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: median: 1.12865 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 51848 bytes +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: size = 4598 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: found 6539 edges +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: mean: 2.84428 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: median: 1.04323 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 52312 bytes +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: lo = 4380 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: total = 4454 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.166 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 955mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: lo = 4524 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: total = 4598 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.168 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 956mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.073 seconds +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 956mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.079 seconds +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 957mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:38:16Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:16Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 10 PSUM Banks +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 284 PSUM Banks +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 348 PSUM Banks +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 534 PSUM Banks +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.265 seconds +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 960mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 926908046 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3470 bytes +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1772576 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 462 bytes +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 134688 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 34 bytes +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 443 PSUM Banks +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.274 seconds +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 960mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: size = 7190 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 932007606 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3479 bytes +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 3492970 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 295 bytes +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 367712 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 64 bytes +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: found 3863 accumulation groups +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: largest = _dot.9701-t33061_i23 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:17Z INFO 8419 []: find first defs for local +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: size = 7820 +2025-11-04T21:38:17Z INFO 8419 []: find first defs for global +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: found 4007 accumulation groups +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: largest = _dot.9701-t33061_i7 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:38:17Z INFO 8419 []: find first defs for local +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: 2201 remat count +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:17Z INFO 8419 []: find first defs for global +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Num intervals 7190 Num locations 7190 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: edge: 89809 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: mean: 24.9816 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: median: 15.9935 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: safe = 7175 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: unsafe = 12 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: inf = 1 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: total = 7188 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 7190 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: 2208 remat count +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Total: 7188 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (7188) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Rover zone: 0.857 (6162) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.083 (597) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.052 (377) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Slice zone: 0.007 (52) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.213 (1532) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.024 (173) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.635 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.666 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.678 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.763 (5483) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.686 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Num intervals 7820 Num locations 7820 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: edge: 97853 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: mean: 25.0263 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: median: 16.2958 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 926908046 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3470 bytes +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1772576 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 462 bytes +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 134688 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 34 bytes +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.310 seconds +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 968mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: safe = 7803 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: unsafe = 14 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: inf = 1 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: total = 7818 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 0 #Pinned 0 #Safe 0 minCost 1.79769e+308 maxCost 2.22507e-308 locations 7820 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: new candidates = 0 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Total: 7818 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (7818) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Rover zone: 0.853 (6665) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.087 (679) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.053 (414) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Slice zone: 0.008 (60) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.202 (1582) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.063 (494) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.365 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.325 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.686 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.734 (5742) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.649 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.997 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.079 seconds +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 969mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12274 memory location(s), 1 block(s), and 53010 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=12274 blocks=1 instructions=53010 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 928680622, 97.416% input load, 0% output write, 2.58404% spill/reload [sg0000] +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(9.04683e+08) +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload memory locations +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2048, 0.00853422% out of total spill/reload dma traffic +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 932007606 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3479 bytes +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 3492970 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 295 bytes +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 367712 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 64 bytes +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.643 seconds +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 972mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.089 seconds +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 974mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13055 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=13055 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 935500576, 96.9814% input load, 3.42063e-06% output write, 3.0186% spill/reload [sg0000] +2025-11-04T21:38:17Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:17Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3472 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 478 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 926907022 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3472 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1771552 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 478 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(9.07262e+08) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 2048, 0.000220528% out of total dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 928678574, 97.4162% input load, 0% output write, 2.58383% spill/reload [sg0000] +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 926907022 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3472 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1771552 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 478 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 134688 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 34 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3382 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.603 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 975mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53008 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=12270 blocks=1 instructions=53008 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 171 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1599 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3479 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 295 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 932007606 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3479 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 3492970 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 295 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 935500576, 96.9814% input load, 3.42063e-06% output write, 3.0186% spill/reload [sg0000] +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 932007606 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3479 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 3492970 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 295 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 367712 +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 64 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3278 bytes +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:38:18Z USER 8419 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 0.600 seconds +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:18Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 224 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 207 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 278 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1636 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1443 Sb address +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.645 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 979mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53008 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=12270 blocks=1 instructions=53008 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: spill space = 67616 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 73728 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: size = 3 +2025-11-04T21:38:18Z INFO 8419 []: find first defs for local +2025-11-04T21:38:18Z INFO 8419 []: find first defs for global +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: Num intervals 3 Num locations 3 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: lo = 3 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: total = 3 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 73728 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.104 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 980mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53008 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=12270 blocks=1 instructions=53008 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 69664 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 69664 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.062 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53008 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=12270 blocks=1 instructions=53008 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 929 out of 3979 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.010 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53008 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=12270 blocks=1 instructions=53008 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:18Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 322 Sb address +2025-11-04T21:38:18Z USER 8419 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.025 seconds +2025-11-04T21:38:18Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 53038, number of allocs: 12270 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.010054 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.006 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.006 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.006 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.035 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.008 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.425-t36551_i1}@SB<0,16928>(128x256)#Internal DebugInfo: <_dot.425||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.769-t36559_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.769||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1113-t36567_i1}@SB<0,23632>(128x256)#Internal DebugInfo: <_dot.1113||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1457-t36575_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.1457||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1801-t36583_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.1801||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2145-t36591_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.2145||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2489-t36599_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.2489||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2833-t36607_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.2833||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3177-t36615_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.3177||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3521-t36623_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.3521||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3865-t36631_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.3865||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4209-t36639_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.4209||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4553-t36647_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.4553||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4897-t36655_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.4897||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5241-t36663_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.5241||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5585-t36671_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.5585||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5929-t36679_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.5929||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6273-t36687_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.6273||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6617-t36695_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.6617||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6961-t36703_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.6961||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7305-t36711_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.7305||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7649-t36719_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.7649||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7993-t36727_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.7993||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8337-t36735_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.8337||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8681-t36743_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.8681||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9025-t36751_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.9025||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9369-t36759_i1}@SB<0,20560>(128x256)#Internal DebugInfo: <_dot.9369||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9713-t36767_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.9713||UNDEF||[128, 128, 1]> +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_23591_33557_i1}@SB<32,16552>(8x1024)#Internal DebugInfo: +2025-11-04T21:38:19Z WARNING 8419 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_23595_33562_i1}@SB<96,17672>(8x1024)#Internal DebugInfo: +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 218 Sb address +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.078 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.008 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 981mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:38:19 2025 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [build_flow_deps]: Allocs: 12270 instructions: 53038 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1443 Sb address +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.714 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 982mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 140921 edges +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [build_flow_deps]: Done build fdeps 140921 Tue Nov 4 21:38:19 2025 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.124 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 982mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.024 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 982mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: spill space = 101440 bytes +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 114688 bytes +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: size = 9 +2025-11-04T21:38:19Z INFO 8419 []: find first defs for local +2025-11-04T21:38:19Z INFO 8419 []: find first defs for global +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: lo = 9 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: total = 9 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 77824 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.131 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1026mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 77824 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 77824 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.090 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 1013 out of 4140 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.015 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1056mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54873 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=13053 blocks=1 instructions=54873 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.031 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 54903, number of allocs: 13053 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.010505 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.007 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.006 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.006 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.044 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.305 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.008 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.083 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.088 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=12270 blocks=1 instructions=53038 Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.008 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:38:19 2025 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [build_flow_deps]: Allocs: 13053 instructions: 54903 +2025-11-04T21:38:19Z USER 8419 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.040 seconds +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1059mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12270 memory location(s), 1 block(s), and 53038 instruction(s). Max writers: 298 Max Readers: 8189 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 153252 edges +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [build_flow_deps]: Done build fdeps 153252 Tue Nov 4 21:38:19 2025 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.122 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1064mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.023 seconds +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1064mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:19Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.298 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.086 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=13053 blocks=1 instructions=54903 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.039 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13053 memory location(s), 1 block(s), and 54903 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 4.532 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25323 blocks=2 instructions=107941 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25323 blocks=2 instructions=107941 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.004 seconds +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25323 memory location(s), 2 block(s), and 107941 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=25323 blocks=2 instructions=107941 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.046 seconds +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108743 instruction(s). Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=25665 blocks=2 instructions=108743 Max writers: 298 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.111 seconds +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108747 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.170 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: reserved space = 233760 bytes +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: spill space = 6995970 bytes +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 7004160 bytes +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: reserved space = 267592 bytes +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: spill space = 6995970 bytes +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 7004160 bytes +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: size = 188 +2025-11-04T21:38:20Z INFO 8419 []: find first defs for local +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.054 seconds +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z INFO 8419 []: find first defs for global +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Num intervals 188 Num locations 188 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: lo = 188 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: total = 188 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 77824 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 77824 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3846144 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3846144 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6299648 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.141 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.145 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.008 seconds +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108747 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.012 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.049 seconds +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.066 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.069 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:20Z USER 8419 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:38:20Z INFO 8419 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z INFO 8419 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z USER 8419 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.144 seconds +2025-11-04T21:38:20Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z USER 8419 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.147 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: nc_parallel_pass finished after 0.152 seconds +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:20Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108747 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:38:20Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:20Z INFO 8419 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:20Z INFO 8419 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:20Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:20Z INFO 8419 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:38:20Z INFO 8419 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:38:20Z INFO 8419 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:20 2025 +2025-11-04T21:38:20Z INFO 8419 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:38:20 2025 +2025-11-04T21:38:21Z INFO 8419 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:21Z INFO 8419 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:38:22Z INFO 8419 [post_scheduler]: Time-aware simulation time: 4885951 +2025-11-04T21:38:22Z INFO 8419 [post_scheduler]: Time-aware simulation time: 5345098 +2025-11-04T21:38:22Z INFO 8419 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:22 2025 +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: post_sched finished after 1.653 seconds +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1207mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.008 seconds +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1097mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53441 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=12441 blocks=1 instructions=53441 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.050 seconds +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:22Z INFO 8419 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:38:22 2025 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: post_sched finished after 1.760 seconds +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.008 seconds +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.052 seconds +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:22Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 1.828 seconds +2025-11-04T21:38:22Z INFO 8419 [BackendPassManager]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:22Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108715 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:38:22Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25665 blocks=2 instructions=108715 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.004 seconds +2025-11-04T21:38:22Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108715 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:22Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:38:22Z INFO 8419 [BackendPassManager]: curr_vmrss: 1099mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:22Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:22Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108715 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:22Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:38:22Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:22Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2249 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2138 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 286 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 273 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2079 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2172 PSUM Banks +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 17 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 23 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 50 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 144 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 150 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 108 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 114 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1018 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: moved 18 MM forward +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1040 Sb address +2025-11-04T21:38:23Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: moved 18 MM forward +2025-11-04T21:38:23Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.407 seconds +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1104mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.481 seconds +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1120mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.388 seconds +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1215mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.422 seconds +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1137mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.133 seconds +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1067mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:38:24 2025 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [build_flow_deps]: Allocs: 12441 instructions: 53409 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 143379 edges +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [build_flow_deps]: Done build fdeps 143379 Tue Nov 4 21:38:24 2025 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.188 seconds +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1082mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 149 │ 131072 │ +│ DMACopy │ Internal -> ExternalOutput │ 112 │ 3758096384 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 2066 │ 904517728 │ +│ Load │ Internal │ 124 │ 2276390 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 336 │ 1771552 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 69 │ +│ 8 │ 3 │ +│ 16 │ 4 │ +│ 32 │ 60 │ +│ 64 │ 31 │ +│ 88 │ 3 │ +│ 128 │ 924 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 305 │ +│ 1024 │ 2 │ +│ 2048 │ 29 │ +│ 4096 │ 353 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 532 │ +│ 16400 │ 8 │ +│ 1048576 │ 112 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 41754 #MatMult-Transposes 8201 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5789964480 +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_48896_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_49012_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:24Z USER 8419 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.017 seconds +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1083mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53409 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.292 seconds +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1090mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:38:24 2025 +2025-11-04T21:38:24Z INFO 8419 (nc00/sg00) [build_flow_deps]: Allocs: 13224 instructions: 55306 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 156838 edges +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [build_flow_deps]: Done build fdeps 156838 Tue Nov 4 21:38:25 2025 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.195 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 180 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 11 │ 2743560 │ +│ Load │ ExternalInput -> Internal │ 2066 │ 904517728 │ +│ Load │ Internal │ 136 │ 4798534 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 493 │ 3492938 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 74 │ +│ 8 │ 6 │ +│ 16 │ 4 │ +│ 32 │ 61 │ +│ 64 │ 91 │ +│ 88 │ 3 │ +│ 128 │ 982 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 307 │ +│ 1024 │ 17 │ +│ 2048 │ 29 │ +│ 4096 │ 381 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 532 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 42374 #MatMult-Transposes 8761 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5789964480 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_48896_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t65818 │ Internal │ float32 │ 2562048 │ +│ -t65812 │ Internal │ float32 │ 2562048 │ +│ -t65815 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_49012_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_49012_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.018 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55306 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 2.430 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=25665 blocks=2 instructions=108715 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 564 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 406 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:38:25Z INFO 8419 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: assign_trigger_engine finished after 0.055 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108715 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108715 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=13224 blocks=1 instructions=55306 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=12441 blocks=1 instructions=53409 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.015 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.015 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.018 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=25665 blocks=2 instructions=108833 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: assign_hwdge_engine finished after 0.015 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 108833 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108833 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 7 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 65 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 199 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 192 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 135 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 23 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2183 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 4 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.009 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 75 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 232 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 332 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 145 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 32 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2296 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 7 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.009 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.014 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.014 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.026 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108833 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:38:25Z INFO 8419 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: nc_parallel_pass finished after 0.003 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108833 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:38:25Z USER 8419 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.077 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.079 seconds +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.083 seconds +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: curr_vmrss: 1091mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z USER 8419 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:38:25Z INFO 8419 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=108833 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z USER 8419 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:38:25Z INFO 8419 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 52212 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 56415 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 55955 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 55955 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 61330 +2025-11-04T21:38:25Z INFO 8419 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 61330 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Finished dependency reduction: 337601 removed, new total 14002 +2025-11-04T21:38:25Z INFO 8419 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: dep_reduction finished after 0.663 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1196mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.015 seconds +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1197mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:25Z USER 8419 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:25Z INFO 8419 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [DepReduction]: Finished dependency reduction: 362169 removed, new total 15081 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: dep_reduction finished after 0.708 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1197mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.027 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1196mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.007 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1196mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53468 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=12441 blocks=1 instructions=53468 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.016 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1195mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.030 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1194mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.007 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1194mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1948/1948 (100% DGE) + power-of-2 partition : 1948/1954 (99.6929% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1949/1955 (99.6931% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 27/27 (100% DGE) + power-of-2 partition : 27/477 (5.66038% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 27/477 (5.66038% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 141 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 121/121 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: lower_dma finished after 0.049 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1191mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55365 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53469 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=13224 blocks=1 instructions=55365 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=12441 blocks=1 instructions=53469 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: expand_all_engine finished after 0.011 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1191mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53469 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=12441 blocks=1 instructions=53469 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1948/1948 (100% DGE) + power-of-2 partition : 1948/1955 (99.6419% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1949/1956 (99.6421% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 39/39 (100% DGE) + power-of-2 partition : 39/652 (5.9816% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 39/652 (5.9816% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_dma finished after 0.059 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1191mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55367 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=13224 blocks=1 instructions=55367 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: expand_all_engine finished after 0.011 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55367 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=13224 blocks=1 instructions=55367 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.072 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53469 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=12441 blocks=1 instructions=53469 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.071 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55367 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=13224 blocks=1 instructions=55367 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: expand_inst_late finished after 0.081 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53590 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=12441 blocks=1 instructions=53590 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [SeqInstOpt]: Removing 118 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.008 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 53472 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=12441 blocks=1 instructions=53472 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: lower_sync finished after 0.021 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 55960 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=12441 blocks=1 instructions=55960 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: lower_act finished after 0.009 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=12441 blocks=1 instructions=56101 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: expand_inst_late finished after 0.084 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1172mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55601 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=13224 blocks=1 instructions=55601 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.009 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1173mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 55371 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=13224 blocks=1 instructions=55371 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_sync finished after 0.024 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1174mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58196 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=13224 blocks=1 instructions=58196 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_act finished after 0.010 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1177mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=13224 blocks=1 instructions=58338 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: lower_dve finished after 0.168 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1191mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=12441 blocks=1 instructions=56101 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: lower_ap finished after 0.012 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1180mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=12441 blocks=1 instructions=56101 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: size = 3 +2025-11-04T21:38:26Z INFO 8419 []: find first defs for local reg +2025-11-04T21:38:26Z INFO 8419 []: find first defs for global reg +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_dve finished after 0.166 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1188mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=13224 blocks=1 instructions=58338 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: lower_ap finished after 0.012 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1172mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=13224 blocks=1 instructions=58338 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:38:26Z INFO 8419 []: find first defs for local reg +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:38:26Z INFO 8419 []: find first defs for global reg +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: lo = 3 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: total = 3 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:26Z USER 8419 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.144 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: curr_vmrss: 1187mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:38:26Z USER 8419 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.159 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: curr_vmrss: 1187mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: nc_parallel_pass finished after 1.435 seconds +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: curr_vmrss: 1152mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: vnc_remote_addr_map finished after 0.005 seconds +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: curr_vmrss: 1130mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 114439 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: Running vnc_link +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 [VncLink]: Found 0 remote updates +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: vnc_link finished after 0.002 seconds +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: curr_vmrss: 1130mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 114439 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:26Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13224 blocks=1 instructions=58338 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=12441 blocks=1 instructions=56101 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z USER 8419 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.115 seconds +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1138mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z USER 8419 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.116 seconds +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1128mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.119 seconds +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: curr_vmrss: 1128mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:38:26Z INFO 8419 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.020 seconds +2025-11-04T21:38:26Z INFO 8419 (sg00) [SubgraphForkPass]: curr_vmrss: 1128mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z INFO 8419 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 114439 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: subgraph_parallel_pass finished after 0.023 seconds +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: curr_vmrss: 1128mb, ru_maxrss: 1233mb (delta=0mb) +2025-11-04T21:38:26Z USER 8419 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:38:26Z INFO 8419 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z USER 8419 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:38:26Z USER 8419 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=13224 blocks=1 instructions=58338 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=12441 blocks=1 instructions=56101 Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:26Z INFO 8419 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64232 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:38:26Z INFO 8419 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64232 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 42779 │ +│ LDWEIGHTS │ 42779 │ +│ CAST │ 2782 │ +│ EVENT_SEMAPHORE │ 2488 │ +│ UNKNOWN(0xd4) │ 2210 │ +│ ACTIVATE │ 2064 │ +│ TENSOR_TENSOR │ 1224 │ +│ COPY │ 1000 │ +│ UNKNOWN(0xd8) │ 645 │ +│ PSEUDO_DMA_TRIGGER │ 598 │ +│ TENSOR_SCALAR_ADDR │ 337 │ +│ TENSOR_SCALAR │ 258 │ +│ MEMSET │ 227 │ +│ UNKNOWN(0xda) │ 180 │ +│ ACT_TABLE_LOAD │ 141 │ +│ IOTA │ 141 │ +│ UNKNOWN(0xd9) │ 59 │ +│ TENSOR_REDUCE │ 58 │ +│ RECIPROCAL │ 57 │ +│ UNKNOWN(0xe8) │ 30 │ +│ STREAM_SHUFFLE │ 24 │ +│ LOAD_MASK_SELECT │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 3 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 5018 │ +│ Scalar │ 6032 │ +│ Tensor │ 86179 │ +│ SyncDMA │ 0 │ +│ Vector │ 2732 │ +│ Sync │ 157 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 43459 │ +│ LDWEIGHTS │ 43459 │ +│ EVENT_SEMAPHORE │ 2825 │ +│ CAST │ 2782 │ +│ UNKNOWN(0xd4) │ 2335 │ +│ ACTIVATE │ 2127 │ +│ TENSOR_TENSOR │ 1225 │ +│ COPY │ 1150 │ +│ PSEUDO_DMA_TRIGGER │ 792 │ +│ UNKNOWN(0xd8) │ 645 │ +│ TENSOR_SCALAR_ADDR │ 562 │ +│ UNKNOWN(0xda) │ 293 │ +│ IOTA │ 282 │ +│ TENSOR_SCALAR │ 260 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ GATHER │ 240 │ +│ MEMSET │ 239 │ +│ ACT_TABLE_LOAD │ 142 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_REDUCE │ 63 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ UNKNOWN(0xe8) │ 30 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 6357 │ +│ Scalar │ 6453 │ +│ Tensor │ 87551 │ +│ SyncDMA │ 0 │ +│ Vector │ 3296 │ +│ Sync │ 192 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:38:27Z USER 8419 (nc01/sg00) [Codegen]: isa_gen finished after 0.421 seconds +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 3184 │ +│ qDVESpillReload0 │ 2160 │ +│ qPoolSpillReload0 │ 33250 │ +│ qSPIO0 │ 42 │ +│ qSPSpillReload0 │ 344269 │ +└───────────────────┴────────────────┘ + +Total descriptors: 382905 (0.00570573 GB) +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ 26453.43523_i1 │ Internal │ bfloat16 │ 1 │ +│ t32897_34327_i0_remote_0 │ Internal │ bfloat16 │ 1 │ +│ 26813.44784_i521 │ Internal │ float32 │ 1 │ +│ dot.172-buffer-65586 │ Internal │ bfloat16 │ 1 │ +│ broadcast_in_dim.17_i1 │ Internal │ int32 │ 2 │ +│ split_1 │ Internal │ int32 │ 2 │ +│ split_2 │ Internal │ float32 │ 2 │ +│ input2 │ ExternalInput │ int32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└──────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:27Z USER 8419 (nc00/sg00) [Codegen]: isa_gen finished after 0.431 seconds +2025-11-04T21:38:27Z USER 8419 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.010 seconds +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 348564 │ +│ qDVESpillReload0 │ 345460 │ +│ qPoolSpillReload0 │ 44068 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 344394 │ +└───────────────────┴────────────────┘ + +Total descriptors: 1082537 (0.0161311 GB) +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌────────────────┬──────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────┼──────────┼──────────┼──────────────────┤ +│ concatenate.10 │ Internal │ bfloat16 │ 3 │ +│ concatenate.19 │ Internal │ bfloat16 │ 3 │ +│ concatenate.18 │ Internal │ bfloat16 │ 3 │ +│ concatenate.12 │ Internal │ bfloat16 │ 3 │ +│ concatenate.17 │ Internal │ bfloat16 │ 3 │ +│ concatenate.14 │ Internal │ bfloat16 │ 3 │ +│ concatenate.15 │ Internal │ bfloat16 │ 3 │ +│ concatenate.13 │ Internal │ bfloat16 │ 3 │ +│ concatenate.16 │ Internal │ bfloat16 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└────────────────┴──────────┴──────────┴──────────────────┘ + +2025-11-04T21:38:27Z USER 8419 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.012 seconds +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:38:27Z WARNING 8419 (nc01/sg00) [Codegen]: Found 186 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:38:27Z USER 8419 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.119 seconds +2025-11-04T21:38:27Z WARNING 8419 (nc00/sg00) [Codegen]: Found 169 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:38:27Z USER 8419 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.118 seconds +2025-11-04T21:38:27Z USER 8419 (nc01/sg00) [ModuleForkPass]: codegen finished after 0.571 seconds +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1237mb, ru_maxrss: 1237mb (delta=4mb) +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12441 memory location(s), 1 block(s), and 56101 instruction(s). Max writers: 299 Max Readers: 8189 +2025-11-04T21:38:27Z USER 8419 (nc00/sg00) [ModuleForkPass]: codegen finished after 0.585 seconds +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1176mb, ru_maxrss: 1237mb (delta=4mb) +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13224 memory location(s), 1 block(s), and 58338 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:27Z USER 8419 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:38:27Z USER 8419 [BackendPassManager]: mod_parallel_pass finished after 0.597 seconds +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: curr_vmrss: 1133mb, ru_maxrss: 1237mb (delta=4mb) +2025-11-04T21:38:27Z USER 8419 [BackendPassManager]: Running hbm_usage +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 448.156KB │ +│ CCE │ 0.000B │ 673.359KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 196.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬─────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼─────────┤ +│ Total: │ 3.658GB │ +│ Model Code │ 6.338MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.008MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 1.287MB │ +└─────────────────────┴─────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 448.000B │ 212.656KB │ +│ CCE │ 0.000B │ 505.359KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.750KB │ 147.750KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:38:27Z INFO 8419 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.657GB │ +│ Model Code │ 6.111MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.008MB │ +│ DMA Ring IO │ 2.188KB │ +│ DMA Ring Spill │ 865.766KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:38:27Z INFO 8419 [HBMUsage]: Total estimated HBM usage is: 3.671GB +2025-11-04T21:38:27Z USER 8419 [BackendPassManager]: hbm_usage finished after 0.005 seconds +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: curr_vmrss: 1115mb, ru_maxrss: 1237mb (delta=0mb) +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 114439 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:27Z USER 8419 [BackendPassManager]: Running neff_packager +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=25665 blocks=2 instructions=114439 Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:27Z WARNING 8419 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge/metrics.json +2025-11-04T21:38:27Z WARNING 8419 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:38:27Z INFO 8419 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff +2025-11-04T21:38:27Z INFO 8419 [NeffFileWriter]: IR signature: 78da4825f4fab9e53f7c6754bc62f5b3 for neff artifacts +2025-11-04T21:38:27Z USER 8419 [BackendPassManager]: neff_packager finished after 0.255 seconds +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: curr_vmrss: 1116mb, ru_maxrss: 1237mb (delta=0mb) +2025-11-04T21:38:27Z INFO 8419 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25665 memory location(s), 2 block(s), and 114439 instruction(s). Max writers: 299 Max Readers: 8749 +2025-11-04T21:38:27Z INFO 8419 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005867 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000107 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.006523 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005867 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000069 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000069 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000069 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005867 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:38:27Z INFO 8419 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_3 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_1 │ int32 │ 1 │ 0.003906 MB │ +│ split_2 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:27Z INFO 8419 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:27Z INFO 8419 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_3 │ float32 │ 1 │ 0.062500 MB │ +│ split_1 │ int32 │ 1 │ 0.003906 MB │ +│ split_2 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:38:27Z INFO 8419 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:38:28Z INFO 8316 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:38:28Z INFO 8316 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:38:28Z INFO 8316 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge +2025-11-04T21:38:28Z INFO 8316 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:38:28Z INFO 8316 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge/hlo_netlist.json +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk0/neuronxcc-_glrdwge/hlo_netlist.json +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:38:28Z INFO 8316 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:38:28Z INFO 8316 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:38:28Z INFO 8303 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk0/metaneff.pb b/token_generation_model/_tp0_bk0/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..2079183b96f55350a14fa218189906ef7c44c735 --- /dev/null +++ b/token_generation_model/_tp0_bk0/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bdbae11a85941fbd43dc68da4566520f5e67c42d53bd7689c25e86efa9b5654 +size 3989548 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb b/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..36e92f146a676cd136e943841591acd3901e9625 --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df7fe8b8c421486f1fc29a5055624b8dcf8503f5ac36202d78331b13d285ca45 +size 3967929 diff --git a/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff b/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff new file mode 100644 index 0000000000000000000000000000000000000000..59cda9c9d4dd4bd28eb422ad1576ededc4bb3f2b --- /dev/null +++ b/token_generation_model/_tp0_bk0/model.MODULE_caeca0352a0240106f96+d5490f71.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47fa87b2ad3d007fa8f4555b943efab99aeb4f481845eaaf73808db23aa72c4 +size 5970944 diff --git a/token_generation_model/_tp0_bk0/neuron_config.json b/token_generation_model/_tp0_bk0/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..3953852cc955c316371ace2771ee10a98774c18a --- /dev/null +++ b/token_generation_model/_tp0_bk0/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 128 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 128 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk0/wrapped_neff.hlo b/token_generation_model/_tp0_bk0/wrapped_neff.hlo new file mode 100644 index 0000000000000000000000000000000000000000..38c2528cbe2bc04263f1a299ffac004398e2a007 --- /dev/null +++ b/token_generation_model/_tp0_bk0/wrapped_neff.hlo @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea9d723cfb7b290aacb7549506f06d3a0b5bf4176ac09b5cfdb7e02ef53acfb +size 6166382 diff --git a/token_generation_model/_tp0_bk1/command.txt b/token_generation_model/_tp0_bk1/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..e471ea4b3c87326a80fad96f1b1bf6175542dfbd --- /dev/null +++ b/token_generation_model/_tp0_bk1/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb --output model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/compile_flags.MODULE_122f32d499d16ac150a0+bdebe6e1.json b/token_generation_model/_tp0_bk1/compile_flags.MODULE_122f32d499d16ac150a0+bdebe6e1.json new file mode 100644 index 0000000000000000000000000000000000000000..b5750ac338bc50989afa9d93ac6605f8bff7b9a0 --- /dev/null +++ b/token_generation_model/_tp0_bk1/compile_flags.MODULE_122f32d499d16ac150a0+bdebe6e1.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/global_metric_store.json b/token_generation_model/_tp0_bk1/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..2bdabcbf5eabfb2065e323efd0b096f1037ad9d8 --- /dev/null +++ b/token_generation_model/_tp0_bk1/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 97.95317077636719, + "StaticProfiler::AveragePartitionUtilization": 92.12669372558594, + "StaticProfiler::AveragePeUtilization": 81.25751495361328, + "StaticProfiler::LocalizationEfficiency": 304.4672546386719, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 313.6503601074219, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.639462947845459, + "AffinePredicateResolution": 0.07714700698852539, + "AliasDependencyElimination": 0.0036094188690185547, + "AliasDependencyInduction": 0.7270452976226807, + "AliasDependencyReset": 0.7638492584228516, + "BFComputeCutting": 0.06763172149658203, + "BirCodeGenLoop": 1.8229739665985107, + "CCOpFusion": 0.6278514862060547, + "CanonicalizeConv": 6.0999998822808266e-05, + "CanonicalizeDAGForPGTiling": 0.17487192153930664, + "CanonicalizeForTensorizer": 0.0004180000105407089, + "CanonicalizeIR": 0.16120052337646484, + "Canonicalizer": 0.009514999575912952, + "CoalesceCCOp": 0.17880868911743164, + "CommuteConcat": 0.032053470611572266, + "DMALocalityOpt": 0.04187321662902832, + "DMAProfiler": 0.08473873138427734, + "DMATilingProfiler": 0.0929880142211914, + "DataLocalityOpt": 3.1744322776794434, + "DataStreaming": 0.16081619262695313, + "DeConcat": 0.054387569427490234, + "DeadCodeElimination": 0.0340876579284668, + "DeadStoreElimination": 1.1505050659179688, + "DelinearIndices": 0.4535233974456787, + "Delinearization": 0.17290353775024414, + "DelinearizeSPMD": 0.21816515922546387, + "DoNothing": 0.0006072521209716797, + "DramToDramTranspose": 0.33189892768859863, + "DumpGraphAndMetadata": 0.14194703102111816, + "EliminateDivs": 0.22325348854064941, + "ExpandBatchNorm": 0.10695385932922363, + "ExpandISAMacro": 0.09116101264953613, + "FactorizeBlkDims": 0.5053305625915527, + "FactorizeThreadAxesInFreeDims": 0.08290529251098633, + "FlattenMacroLoop": 0.11119699478149414, + "GenericAccessSimplifier": 0.0268251895904541, + "HoistCompute": 0.00010399999882793054, + "IdentifyCrossPassTensors": 0.00016999999934341758, + "InferInitValue": 1.3557538986206055, + "InferIntrinsicOnCC": 0.3400564193725586, + "InferNeuronTensor": 1.724083662033081, + "InferNonlocalTensors": 5.568960189819336, + "InferPSumTensor": 1.3407132625579834, + "InferShardAxis": 9.273904800415039, + "InferSharedMemLoc": 0.11475968360900879, + "InlineNativeKernels": 0.057065725326538086, + "InsertCoreBarrier": 0.12937331199645996, + "InsertIOTransposes": 0.901848554611206, + "InsertImplicitShardAxisBeforeISel": 0.42427659034729004, + "InsertLocalTransposes": 0.9839861392974854, + "InsertOffloadedTransposes": 0.12621712684631348, + "LICM": 0.14690518379211426, + "LateLegalizeInst": 0.1602187156677246, + "LateLegalizePostSplit": 0.10259485244750977, + "LateLowerReshapeOp": 0.04077744483947754, + "LateLowerTensorOp": 0.35434746742248535, + "LateNeuronInstComb": 1.1333520412445068, + "LayoutPreprocessing": 0.9390566349029541, + "LayoutPreprocessingAndAnalysis": 1.3896429538726807, + "LayoutRequirementAnalysis": 0.441650390625, + "LegalizeCCOpLayout": 0.14277291297912598, + "LegalizeOpLevelAlias": 0.04881477355957031, + "LegalizePartitionReduce": 0.04958534240722656, + "LegalizeSundaAccess": 0.9816975593566895, + "LegalizeSundaMacro": 0.6966722011566162, + "LegalizeType": 0.15005135536193848, + "LocalLayoutOpt": 0.6860671043395996, + "LoopFusion": 0.3846933841705322, + "LoopSplitting": 0.03160572052001953, + "LowerBroadcast": 0.0686800479888916, + "LowerCCOpBlockAxis": 0.2379469871520996, + "LowerComplexBroadcast": 0.07929420471191406, + "LowerIntrinsics": 1.3037331104278564, + "LowerShardAxis": 0.21744608879089355, + "LowerTensorOp": 0.6258466243743896, + "LowerToSendRecv": 0.1623542308807373, + "LowerTranspose": 0.5244274139404297, + "MacroGeneration": 2.6023354530334473, + "MaskPropagation": 0.11484622955322266, + "MemcastMotion": 0.0001340000017080456, + "MemcpyElimination": 9.5592622756958, + "MutateDataType": 0.040682315826416016, + "NeuronAliasDependencyInduction": 0.01969146728515625, + "NeuronAliasDependencyReset": 0.035384178161621094, + "NeuronInstComb": 0.381716251373291, + "NeuronLICM": 0.2757301330566406, + "NeuronLoopFusion": 1.5984740257263184, + "NeuronLoopInterchange": 0.06362342834472656, + "NeuronSimplifier": 0.49792051315307617, + "NeuronSimplifyPredicates": 0.2821841239929199, + "NeuronValueNumbering": 0.1226806640625, + "OptimizeAliasedCopyChain": 0.023923158645629883, + "OptimizeNKIKernels": 1.5258629322052002, + "PAGLayoutOpt": 15.819625854492188, + "PComputeCutting": 0.2921912670135498, + "PGLayoutTilingPipeline": 41.185035705566406, + "PGTiling": 5.950519561767578, + "PadElimination": 0.018389463424682617, + "ParAxesAnnotation": 14.825729370117188, + "PartialLoopFusion": 1.1320111751556396, + "PartialSimdFusion": 0.7182748317718506, + "PenguinizeFunctions": 0.00022699999681208283, + "PerfectLoopNest": 0.07256269454956055, + "PruneFunctions": 0.0004199999966658652, + "RecognizeOpIdiom": 0.12366747856140137, + "Recompute": 0.010107755661010742, + "RelaxPredicates": 0.11711525917053223, + "Rematerialization": 0.4015536308288574, + "RemoveOptimizationBarriers": 0.0001049999991664663, + "RemoveShardedPartitionAxes": 1.1850566864013672, + "ReshapeWeights": 0.032259225845336914, + "ResolveAccessConflict": 0.1985619068145752, + "ResolveComplicatePredicates": 0.06787538528442383, + "RewriteReplicationMatmul": 0.04492545127868652, + "RewriteWeights": 0.088104248046875, + "SFKVectorizer": 6.40352725982666, + "ScatterMotion": 0.005121999885886908, + "ShardingPropagationAnalysis": 0.752739667892456, + "SimpleAllReduceTiling": 0.07182931900024414, + "Simplifier": 0.10222339630126953, + "SimplifyMacroPredicates": 0.28852272033691406, + "SimplifyNeuronTensor": 0.3881356716156006, + "SimplifySlice": 0.0324554443359375, + "SimplifyTensor": 0.296234130859375, + "SpillPSum": 0.6311988830566406, + "SplitAPUnionSets": 0.4930405616760254, + "SplitAccGrp": 0.05073070526123047, + "StaticProfiler": 0.13188433647155762, + "StaticTransposeLocalTensor": 0.2777891159057617, + "SundaISel": 1.8359394073486328, + "TCTransform": 0.03323030471801758, + "TensorInitialization": 0.1669929027557373, + "TensorOpSimplifier": 0.6102476119995117, + "TensorOpTransform": 2.2951812744140625, + "TensorizerLegalizationPass": 0.00022400000307243317, + "TileCCOps": 0.2711188793182373, + "TilingProfiler": 0.4868581295013428, + "TransformConvOp": 0.1828169822692871, + "TritiumFusion": 1.0812408924102783, + "ValueNumbering": 0.09822511672973633, + "VectorizeDMA": 0.6892695426940918, + "VectorizeMatMult": 0.06119871139526367, + "VerifySupportedOps": 0.00039400000241585076, + "WeightCoalescing": 0.061014413833618164, + "ZeroSizeTensorElimination": 0.00042939186096191406, + "algsimp": 0.002426000079140067, + "batchnorm_expander": 0.004927999805659056, + "boundary-marker-removal": 0.0005169999785721302, + "call-inliner": 0.00046099998871795833, + "canonicalize-boundary-marker": 0.0006290000164881349, + "collective-stream-id-checker": 9.200000204145908e-05, + "comparison-expander": 0.0006270000012591481, + "computation-deduplicator": 0.0005849999724887311, + "config-lowering": 0.0004689999914262444, + "constant_folding": 0.00033599999733269215, + "cse": 0.000699999975040555, + "dce": 4.8999998398358e-05, + "dynamic-slice-transpose": 0.00023200000578071922, + "eliminate-redundant-compare": 0.0003110000106971711, + "emit-offloaded-dropout": 0.0004180000105407089, + "flatten-call-graph": 0.00046800001291558146, + "fuse-send-recv": 0.002374999923631549, + "hilo-conditional-to-select": 0.00015300000086426735, + "hilo::LegalizeAlias": 0.004660999868065119, + "hilo::NeuronInstCombine": 0.0014799999771639705, + "hilo::NeuronOpFusion": 0.0004180000105407089, + "hilo::ReplaceTokenTypeWithU8Pass": 0.0002460000105202198, + "hilo::ScheduleFusion": 4.099999932805076e-05, + "hilo::SixtyFourHack": 0.00042299999040551484, + "hilo::VerifyAliasing": 0.00014200000441633165, + "hlo-mac-count": 0.005288000218570232, + "io-con-pipe-begin": 1.1000000085914508e-05, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015399999683722854, + "legalize-ccops-for-tensorizer": 2.5999999706982635e-05, + "legalize-compare": 0.0004839999892283231, + "lower-argminmax-custom-call": 0.0002640000020619482, + "map-inline": 0.0010479999473318458, + "metadata-naming": 0.0018269999418407679, + "mlir::detail::OpToOpPassAdaptor": 0.00019799999427050352, + "mlir::hlo::MhloToPyPenguin": 0.08273100107908249, + "mlir::mhlo::LowerComplexExtraPass": 0.004081000108271837, + "mlir::mhlo::LowerComplexPass": 0.002167999977245927, + "native-to-custom-softmax": 0.0005329999839887023, + "native-to-custom-softmax-dx": 0.000582000007852912, + "neuron-hlo-verifier": 0.027639999985694885, + "operand_upcaster": 0.0010930000571534038, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.10452699661254883, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003980000037699938, + "reshape-mover": 0.00010900000052060932, + "simplify-concat": 0.0024399999529123306, + "simplify-while-loops": 8.600000001024455e-05, + "transform-variadic-reduce": 0.0009239999926649034, + "tuple-simplifier": 0.0003129999968223274, + "unpack-nested-aws-ntwsr": 0.0006539999740198255, + "unroll-while-loop": 1.5999999959603883e-05 + }, + "hilo": { + "HloMacCount": 7016251392.0, + "Traffic": 3915371264.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 88860, + "StaticProfiler::AifUb": 10.935946464538574, + "StaticProfiler::ArithmeticIntensityTensorizer": 33.29637145996094, + "StaticProfiler::AverageDmaLength": 4001.379150390625, + "StaticProfiler::DDRTransferBytes": 1901726676, + "StaticProfiler::InternalTransferBytes": 397203648, + "StaticProfiler::LoadExpanded": 327974, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 336323, + "StaticProfiler::TotalDynamicInstancesCount": 111012, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 93237, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 65953, + "TilingProfiler::NumPfTransposes": 350, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 200, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 16459, + "TilingProfiler::PfTransposeInstructionsForIo": 13090, + "TilingProfiler::PfTransposeInstructionsForLocal": 1407, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 117, + "TilingProfiler::SimdInstructionsAfterTiling": 2626, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 6.0999998822808266e-05, + "CanonicalizeForTensorizer": 0.0004180000105407089, + "Canonicalizer": 0.009514999575912952, + "HoistCompute": 0.00010399999882793054, + "IdentifyCrossPassTensors": 0.00016999999934341758, + "MemcastMotion": 0.0001340000017080456, + "PenguinizeFunctions": 0.00022699999681208283, + "PruneFunctions": 0.0004199999966658652, + "RemoveOptimizationBarriers": 0.0001049999991664663, + "ScatterMotion": 0.005121999885886908, + "TensorizerLegalizationPass": 0.00022400000307243317, + "VerifySupportedOps": 0.00039400000241585076, + "algsimp": 0.002426000079140067, + "batchnorm_expander": 0.004927999805659056, + "boundary-marker-removal": 0.0005169999785721302, + "call-inliner": 0.00046099998871795833, + "canonicalize-boundary-marker": 0.0006290000164881349, + "collective-stream-id-checker": 9.200000204145908e-05, + "comparison-expander": 0.0006270000012591481, + "computation-deduplicator": 0.0005849999724887311, + "config-lowering": 0.0004689999914262444, + "constant_folding": 0.00033599999733269215, + "cse": 0.000699999975040555, + "dce": 4.8999998398358e-05, + "dynamic-slice-transpose": 0.00023200000578071922, + "eliminate-redundant-compare": 0.0003110000106971711, + "emit-offloaded-dropout": 0.0004180000105407089, + "flatten-call-graph": 0.00046800001291558146, + "fuse-send-recv": 0.002374999923631549, + "hilo-conditional-to-select": 0.00015300000086426735, + "hilo::LegalizeAlias": 0.004660999868065119, + "hilo::NeuronInstCombine": 0.0014799999771639705, + "hilo::NeuronOpFusion": 0.0004180000105407089, + "hilo::ReplaceTokenTypeWithU8Pass": 0.0002460000105202198, + "hilo::ScheduleFusion": 4.099999932805076e-05, + "hilo::SixtyFourHack": 0.00042299999040551484, + "hilo::VerifyAliasing": 0.00014200000441633165, + "hlo-mac-count": 0.005288000218570232, + "io-con-pipe-begin": 1.1000000085914508e-05, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015399999683722854, + "legalize-ccops-for-tensorizer": 2.5999999706982635e-05, + "legalize-compare": 0.0004839999892283231, + "lower-argminmax-custom-call": 0.0002640000020619482, + "map-inline": 0.0010479999473318458, + "metadata-naming": 0.0018269999418407679, + "mlir::detail::OpToOpPassAdaptor": 0.00019799999427050352, + "mlir::hlo::MhloToPyPenguin": 0.08273100107908249, + "mlir::mhlo::LowerComplexExtraPass": 0.004081000108271837, + "mlir::mhlo::LowerComplexPass": 0.002167999977245927, + "native-to-custom-softmax": 0.0005329999839887023, + "native-to-custom-softmax-dx": 0.000582000007852912, + "neuron-hlo-verifier": 0.027639999985694885, + "operand_upcaster": 0.0010930000571534038, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.10452699661254883, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003980000037699938, + "reshape-mover": 0.00010900000052060932, + "simplify-concat": 0.0024399999529123306, + "simplify-while-loops": 8.600000001024455e-05, + "transform-variadic-reduce": 0.0009239999926649034, + "tuple-simplifier": 0.0003129999968223274, + "unpack-nested-aws-ntwsr": 0.0006539999740198255, + "unroll-while-loop": 1.5999999959603883e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002186298370361328, + "DMALocalityOpt": 0.00017881393432617188, + "DMAProfiler": 0.0007708072662353516, + "DataStreaming": 0.0002703666687011719, + "DoNothing": 0.00036787986755371094, + "ExpandISAMacro": 0.0018739700317382813, + "FactorizeBlkDims": 0.0004401206970214844, + "InferPSumTensor": 0.0006031990051269531, + "InferSharedMemLoc": 0.00032973289489746094, + "InsertCoreBarrier": 0.0002486705780029297, + "LateLegalizeInst": 0.00042176246643066406, + "LateNeuronInstComb": 0.0006310939788818359, + "LegalizeSundaAccess": 0.0015883445739746094, + "LegalizeType": 0.0002715587615966797, + "LowerBroadcast": 0.0002288818359375, + "LowerIntrinsics": 0.00025343894958496094, + "LowerTranspose": 0.0002288818359375, + "NeuronInstComb": 0.0006537437438964844, + "NeuronLICM": 0.00035953521728515625, + "NeuronSimplifyPredicates": 0.0023772716522216797, + "NeuronValueNumbering": 0.0004177093505859375, + "SFKVectorizer": 0.0029053688049316406, + "SimpleAllReduceTiling": 0.00021219253540039063, + "SimplifyNeuronTensor": 0.0005879402160644531, + "SpillPSum": 0.0005121231079101563, + "WeightCoalescing": 0.0002167224884033203 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 3.5839521884918213, + "HloMacCount": 7016251392.0, + "Traffic": 3915371264.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.639462947845459, + "AffinePredicateResolution": 0.07714700698852539, + "AliasDependencyElimination": 0.0036094188690185547, + "AliasDependencyInduction": 0.7270452976226807, + "AliasDependencyReset": 0.7638492584228516, + "BFComputeCutting": 0.06763172149658203, + "BirCodeGenLoop": 1.8229739665985107, + "CCOpFusion": 0.6278514862060547, + "CanonicalizeDAGForPGTiling": 0.17487192153930664, + "CanonicalizeIR": 0.16120052337646484, + "CoalesceCCOp": 0.17418503761291504, + "CommuteConcat": 0.032053470611572266, + "DMALocalityOpt": 0.03976774215698242, + "DMAProfiler": 0.0791785717010498, + "DMATilingProfiler": 0.0929880142211914, + "DataLocalityOpt": 3.1744322776794434, + "DataStreaming": 0.1563587188720703, + "DeConcat": 0.054387569427490234, + "DeadCodeElimination": 0.0340876579284668, + "DeadStoreElimination": 1.1505050659179688, + "DelinearIndices": 0.4535233974456787, + "Delinearization": 0.17290353775024414, + "DelinearizeSPMD": 0.21816515922546387, + "DoNothing": 6.508827209472656e-05, + "DramToDramTranspose": 0.33189892768859863, + "DumpGraphAndMetadata": 0.14194703102111816, + "EliminateDivs": 0.22325348854064941, + "ExpandBatchNorm": 0.10695385932922363, + "ExpandISAMacro": 0.08657169342041016, + "FactorizeBlkDims": 0.4973928928375244, + "FactorizeThreadAxesInFreeDims": 0.08290529251098633, + "FlattenMacroLoop": 0.11119699478149414, + "GenericAccessSimplifier": 0.0268251895904541, + "InferInitValue": 1.3557538986206055, + "InferIntrinsicOnCC": 0.3400564193725586, + "InferNeuronTensor": 1.724083662033081, + "InferNonlocalTensors": 5.568960189819336, + "InferPSumTensor": 1.3334856033325195, + "InferShardAxis": 9.273904800415039, + "InferSharedMemLoc": 0.10996842384338379, + "InlineNativeKernels": 0.057065725326538086, + "InsertCoreBarrier": 0.12514686584472656, + "InsertIOTransposes": 0.901848554611206, + "InsertImplicitShardAxisBeforeISel": 0.42427659034729004, + "InsertLocalTransposes": 0.9839861392974854, + "InsertOffloadedTransposes": 0.12621712684631348, + "LICM": 0.14690518379211426, + "LateLegalizeInst": 0.15152931213378906, + "LateLegalizePostSplit": 0.10259485244750977, + "LateLowerReshapeOp": 0.04077744483947754, + "LateLowerTensorOp": 0.35434746742248535, + "LateNeuronInstComb": 1.1267306804656982, + "LayoutPreprocessing": 0.9390566349029541, + "LayoutPreprocessingAndAnalysis": 1.3896429538726807, + "LayoutRequirementAnalysis": 0.441650390625, + "LegalizeCCOpLayout": 0.14277291297912598, + "LegalizeOpLevelAlias": 0.04881477355957031, + "LegalizePartitionReduce": 0.04958534240722656, + "LegalizeSundaAccess": 0.9715769290924072, + "LegalizeSundaMacro": 0.6966722011566162, + "LegalizeType": 0.14412307739257813, + "LocalLayoutOpt": 0.6860671043395996, + "LoopFusion": 0.3846933841705322, + "LoopSplitting": 0.03160572052001953, + "LowerBroadcast": 0.06609702110290527, + "LowerCCOpBlockAxis": 0.2379469871520996, + "LowerComplexBroadcast": 0.07929420471191406, + "LowerIntrinsics": 1.3009629249572754, + "LowerShardAxis": 0.21744608879089355, + "LowerTensorOp": 0.6258466243743896, + "LowerToSendRecv": 0.1623542308807373, + "LowerTranspose": 0.521834135055542, + "MacroGeneration": 2.6023354530334473, + "MaskPropagation": 0.11484622955322266, + "MemcpyElimination": 9.5592622756958, + "MutateDataType": 0.040682315826416016, + "NeuronAliasDependencyInduction": 0.01969146728515625, + "NeuronAliasDependencyReset": 0.035384178161621094, + "NeuronInstComb": 0.37508440017700195, + "NeuronLICM": 0.26921892166137695, + "NeuronLoopFusion": 1.5984740257263184, + "NeuronLoopInterchange": 0.06362342834472656, + "NeuronSimplifier": 0.49792051315307617, + "NeuronSimplifyPredicates": 0.277324914932251, + "NeuronValueNumbering": 0.11949491500854492, + "OptimizeAliasedCopyChain": 0.023923158645629883, + "OptimizeNKIKernels": 1.5258629322052002, + "PAGLayoutOpt": 15.819625854492188, + "PComputeCutting": 0.2921912670135498, + "PGLayoutTilingPipeline": 41.185035705566406, + "PGTiling": 5.950519561767578, + "PadElimination": 0.018389463424682617, + "ParAxesAnnotation": 14.825729370117188, + "PartialLoopFusion": 1.1320111751556396, + "PartialSimdFusion": 0.7182748317718506, + "PerfectLoopNest": 0.07256269454956055, + "RecognizeOpIdiom": 0.12366747856140137, + "Recompute": 0.010107755661010742, + "RelaxPredicates": 0.11711525917053223, + "Rematerialization": 0.4015536308288574, + "RemoveShardedPartitionAxes": 1.1850566864013672, + "ReshapeWeights": 0.032259225845336914, + "ResolveAccessConflict": 0.1985619068145752, + "ResolveComplicatePredicates": 0.06787538528442383, + "RewriteReplicationMatmul": 0.04492545127868652, + "RewriteWeights": 0.088104248046875, + "SFKVectorizer": 6.364406108856201, + "ShardingPropagationAnalysis": 0.752739667892456, + "SimpleAllReduceTiling": 0.06724977493286133, + "Simplifier": 0.10222339630126953, + "SimplifyMacroPredicates": 0.28852272033691406, + "SimplifyNeuronTensor": 0.33968162536621094, + "SimplifySlice": 0.0324554443359375, + "SimplifyTensor": 0.296234130859375, + "SpillPSum": 0.6185405254364014, + "SplitAPUnionSets": 0.4930405616760254, + "SplitAccGrp": 0.05073070526123047, + "StaticProfiler": 0.13188433647155762, + "StaticTransposeLocalTensor": 0.2777891159057617, + "SundaISel": 1.8359394073486328, + "TCTransform": 0.03323030471801758, + "TensorInitialization": 0.1669929027557373, + "TensorOpSimplifier": 0.6102476119995117, + "TensorOpTransform": 2.2951812744140625, + "TileCCOps": 0.2711188793182373, + "TilingProfiler": 0.4868581295013428, + "TransformConvOp": 0.1828169822692871, + "TritiumFusion": 1.0812408924102783, + "ValueNumbering": 0.09822511672973633, + "VectorizeDMA": 0.6892695426940918, + "VectorizeMatMult": 0.06119871139526367, + "WeightCoalescing": 0.058365583419799805, + "ZeroSizeTensorElimination": 0.00042939186096191406 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 88860, + "StaticProfiler::AifUb": 10.935946464538574, + "StaticProfiler::ArithmeticIntensityTensorizer": 33.29637145996094, + "StaticProfiler::AverageDmaLength": 4001.379150390625, + "StaticProfiler::AverageFractalPeUtilization": 97.95317077636719, + "StaticProfiler::AveragePartitionUtilization": 92.12669372558594, + "StaticProfiler::AveragePeUtilization": 81.25751495361328, + "StaticProfiler::DDRTransferBytes": 1901726676, + "StaticProfiler::InternalTransferBytes": 397203648, + "StaticProfiler::LoadExpanded": 327974, + "StaticProfiler::LocalizationEfficiency": 304.4672546386719, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 313.6503601074219, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 336323, + "StaticProfiler::TotalDynamicInstancesCount": 111012, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 93237, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 65953, + "TilingProfiler::NumPfTransposes": 350, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 200, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 16459, + "TilingProfiler::PfTransposeInstructionsForIo": 13090, + "TilingProfiler::PfTransposeInstructionsForLocal": 1407, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 117, + "TilingProfiler::SimdInstructionsAfterTiling": 2626, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.004405021667480469, + "DMALocalityOpt": 0.0019266605377197266, + "DMAProfiler": 0.0047893524169921875, + "DataStreaming": 0.004187107086181641, + "DoNothing": 0.0001742839813232422, + "ExpandISAMacro": 0.0027153491973876953, + "FactorizeBlkDims": 0.007497549057006836, + "InferPSumTensor": 0.006624460220336914, + "InferSharedMemLoc": 0.004461526870727539, + "InsertCoreBarrier": 0.003977775573730469, + "LateLegalizeInst": 0.008267641067504883, + "LateNeuronInstComb": 0.005990266799926758, + "LegalizeSundaAccess": 0.008532285690307617, + "LegalizeType": 0.005656719207763672, + "LowerBroadcast": 0.002354145050048828, + "LowerIntrinsics": 0.0025167465209960938, + "LowerTranspose": 0.0023643970489501953, + "NeuronInstComb": 0.005978107452392578, + "NeuronLICM": 0.006151676177978516, + "NeuronSimplifyPredicates": 0.0024819374084472656, + "NeuronValueNumbering": 0.0027680397033691406, + "SFKVectorizer": 0.036216020584106445, + "SimpleAllReduceTiling": 0.004367351531982422, + "SimplifyNeuronTensor": 0.047866106033325195, + "SpillPSum": 0.012146234512329102, + "WeightCoalescing": 0.002432107925415039 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk1/graph.neff b/token_generation_model/_tp0_bk1/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..96245a45162e26d4f994ef20644d1d1eea4b346f --- /dev/null +++ b/token_generation_model/_tp0_bk1/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73cd1b99047c46f005f119317e5cec46b3ade159908bfc6f17487c1550536355 +size 6093824 diff --git a/token_generation_model/_tp0_bk1/log-neuron-cc.txt b/token_generation_model/_tp0_bk1/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..64d7ddeb8810e89c76cec532f736306bd1e2d264 --- /dev/null +++ b/token_generation_model/_tp0_bk1/log-neuron-cc.txt @@ -0,0 +1,4593 @@ +2025-11-04T21:38:36Z INFO 8792 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:36Z INFO 8792 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:36Z INFO 8853 [root]: XLA detected +2025-11-04T21:38:36Z INFO 8853 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:36Z INFO 8853 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1 +2025-11-04T21:38:36Z INFO 8853 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:36Z INFO 8853 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8853 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:36Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:36Z INFO 8853 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:36Z INFO 8853 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8853 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:37Z INFO 8853 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:38:36.991859: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:38:37.002725: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10701 = tuple(%reshape.4385, %scatter.9929, %scatter.9944, %scatter.9957, %scatter.9972, %scatter.9985, %scatter.10000, %scatter.10013, %scatter.10028, %scatter.10041, %scatter.10056, %scatter.10069, %scatter.10084, %scatter.10097, %scatter.10112, %scatter.10125, %scatter.10140, %scatter.10153, %scatter.10168, %scatter.10181, %scatter.10196, %scatter.10209, %scatter.10224, %scatter.10237, %scatter.10252, %scatter.10265, %scatter.10280, %scatter.10293, %scatter.10308, %scatter.10321, %scatter.10336... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:37Z INFO 8853 [job.HLOToTensorizer.0]: IR signature: 9970e76685d90d06ef40c9cf8874e0d68e84099fb92abf553ddc7831d8f2c638 for sg0000/HLOToTensorizer +2025-11-04T21:38:37Z INFO 8853 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:37Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:37Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:37Z INFO 8853 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:37Z INFO 8853 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:37Z INFO 8853 [job.Frontend.0]: Start model loading +2025-11-04T21:38:37Z INFO 8853 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:37Z INFO 8853 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:38:37Z USER 8853 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:37Z INFO 8853 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:38:37Z INFO 8853 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:38:39Z INFO 8853 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.049 seconds +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.024 seconds +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.158 seconds +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.183 seconds +2025-11-04T21:38:39Z INFO 8853 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.626 seconds +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.008 seconds +2025-11-04T21:38:40Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.505 seconds +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.546 seconds +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.143 seconds +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.610 seconds +2025-11-04T21:38:41Z INFO 8853 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.161 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.068 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.077 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.223 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.067 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.335 seconds +2025-11-04T21:38:42Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.310 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.213 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.859 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.078 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.086 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.079 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.080 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.107 seconds +2025-11-04T21:38:43Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.072 seconds +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.065 seconds +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:44Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 1.957 seconds +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.337 seconds +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 2.295 seconds +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.354 seconds +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.012 seconds +2025-11-04T21:38:46Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:47Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8853 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.727 seconds +2025-11-04T21:38:47Z INFO 8853 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.764 seconds +2025-11-04T21:38:47Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:47Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:56Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 9.258 seconds +2025-11-04T21:38:56Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:57Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.299 seconds +2025-11-04T21:38:57Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:57Z INFO 8853 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 9.559 seconds +2025-11-04T21:38:57Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:57Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.921 seconds +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.444 seconds +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.332 seconds +2025-11-04T21:38:58Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:59Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.613 seconds +2025-11-04T21:38:59Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.530 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.859 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.402 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.168 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.159 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.149 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.477 seconds +2025-11-04T21:39:00Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:01Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:39:01Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.631 seconds +2025-11-04T21:39:01Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.686 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.119 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.121 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.073 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.163 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.201 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.118 seconds +2025-11-04T21:39:03Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.235 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.562 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.032 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.060 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.125 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.106 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.232 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.126 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.057 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.018 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.145 seconds +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:04Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.185 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.193 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.385 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.031 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.104 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.104 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.054 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.098 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.033 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.032 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.032 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.123 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.124 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.096 seconds +2025-11-04T21:39:05Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.151 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.010 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.033 seconds +2025-11-04T21:39:07Z INFO 8853 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.041 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.027 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.102 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.102 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-9247 AllGather_add(float32 (256, 8) %'transpose.537', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9754 | hlo_id: 9754 | , id = 9247 +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-9263 AllGather_add(uint32 (256, 8) %'transpose.538', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9893 | hlo_id: 9893 | , id = 9263 +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.271 seconds +2025-11-04T21:39:07Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.544 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.135 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.334 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.034 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.034 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.041 seconds +2025-11-04T21:39:08Z INFO 8853 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.340 seconds +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.031 seconds +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.199 seconds +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.060 seconds +2025-11-04T21:39:09Z INFO 8853 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.686 seconds +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.362 seconds +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.244 seconds +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:39:10Z INFO 8853 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.135 seconds +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.939 seconds +2025-11-04T21:39:11Z INFO 8853 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:39:12Z INFO 8853 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.442 seconds +2025-11-04T21:39:12Z INFO 8853 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.390 seconds +2025-11-04T21:39:12Z INFO 8853 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:39:12Z INFO 8853 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:13Z INFO 8853 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:17Z INFO 8853 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:39:17Z INFO 8853 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 5.569 seconds +2025-11-04T21:39:17Z INFO 8853 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:39:17Z INFO 8853 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:39:18Z INFO 8853 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:39:32Z INFO 8853 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8853 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 14.826 seconds +2025-11-04T21:39:32Z INFO 8853 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.984 seconds +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 15.820 seconds +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.173 seconds +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.218 seconds +2025-11-04T21:39:33Z INFO 8853 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:39:34Z INFO 8853 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.753 seconds +2025-11-04T21:39:34Z INFO 8853 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:39:41Z INFO 8853 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:39:41Z INFO 8853 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1215 +total number of sharded dags: 436 + +total bytes transferred from input, output, non local tensors: 1844922696 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 1842290820 +% bytes transferred with 2x bandwidths: 99.86 + +NC0 FLOPs: 7011502233 +NC1 FLOPs: 7004162944 +% FLOPs sharded: 99.94 + + +Shard dim: 256, Number of dags: 235 +Matmuls sharded with this dim: +[2,2,64] @ [2,64,256(s)] = [2,256(s)] Number of occurrences: 28 +[2,256(s)] @ [256(s),128] = [2,128] Number of occurrences: 28 + + +Shard dim: 2, Number of dags: 196 +Matmuls sharded with this dim: +[8,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [8,8,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,128] = [8,2,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [8,2,2,2,2,64] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [8,2,2,2,64] Number of occurrences: 28 +[8,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [8,2(s),2,4,128] Number of occurrences: 28 +[8,2,8,128] @ [2,8,128,2(s),6,2,128] = [8,2(s),6,2,128] Number of occurrences: 56 + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2,8,128] @ [2,8,128,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:39:42Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:42Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:42Z INFO 8853 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.454 seconds +2025-11-04T21:39:42Z INFO 8853 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 1.185 seconds +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 9.274 seconds +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.115 seconds +2025-11-04T21:39:43Z INFO 8853 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.175 seconds +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.238 seconds +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:39:44Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (79, 'AG3792'), (80, 'AG3791'), (218, 'AG3783'), (474, 'AG3782'), (274, 'AG3789')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (83, 'AG3808'), (84, 'AG3807'), (218, 'AG3783'), (474, 'AG3782'), (272, 'AG3805')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (87, 'AG3824'), (88, 'AG3823'), (218, 'AG3783'), (474, 'AG3782'), (270, 'AG3821')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10312 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (91, 'AG3840'), (92, 'AG3839'), (218, 'AG3783'), (474, 'AG3782'), (268, 'AG3837')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (95, 'AG3856'), (96, 'AG3855'), (218, 'AG3783'), (474, 'AG3782'), (266, 'AG3853')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (99, 'AG3872'), (100, 'AG3871'), (218, 'AG3783'), (474, 'AG3782'), (264, 'AG3869')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11065 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (103, 'AG3888'), (104, 'AG3887'), (218, 'AG3783'), (474, 'AG3782'), (262, 'AG3885')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11316 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (107, 'AG3904'), (108, 'AG3903'), (218, 'AG3783'), (474, 'AG3782'), (260, 'AG3901')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (111, 'AG3920'), (112, 'AG3919'), (218, 'AG3783'), (474, 'AG3782'), (258, 'AG3917')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11818 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (115, 'AG3936'), (116, 'AG3935'), (218, 'AG3783'), (474, 'AG3782'), (256, 'AG3933')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (119, 'AG3952'), (120, 'AG3951'), (218, 'AG3783'), (474, 'AG3782'), (254, 'AG3949')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12320 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (123, 'AG3968'), (124, 'AG3967'), (218, 'AG3783'), (474, 'AG3782'), (252, 'AG3965')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12571 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (127, 'AG3984'), (128, 'AG3983'), (218, 'AG3783'), (474, 'AG3782'), (250, 'AG3981')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (131, 'AG4000'), (132, 'AG3999'), (218, 'AG3783'), (474, 'AG3782'), (248, 'AG3997')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13073 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (135, 'AG4016'), (136, 'AG4015'), (218, 'AG3783'), (474, 'AG3782'), (246, 'AG4013')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13324 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (139, 'AG4032'), (140, 'AG4031'), (218, 'AG3783'), (474, 'AG3782'), (244, 'AG4029')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (143, 'AG4048'), (144, 'AG4047'), (218, 'AG3783'), (474, 'AG3782'), (242, 'AG4045')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13826 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (147, 'AG4064'), (148, 'AG4063'), (218, 'AG3783'), (474, 'AG3782'), (240, 'AG4061')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (151, 'AG4080'), (152, 'AG4079'), (218, 'AG3783'), (474, 'AG3782'), (238, 'AG4077')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14328 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (155, 'AG4096'), (156, 'AG4095'), (218, 'AG3783'), (474, 'AG3782'), (236, 'AG4093')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (159, 'AG4112'), (160, 'AG4111'), (218, 'AG3783'), (474, 'AG3782'), (234, 'AG4109')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (163, 'AG4128'), (164, 'AG4127'), (218, 'AG3783'), (474, 'AG3782'), (232, 'AG4125')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (167, 'AG4144'), (168, 'AG4143'), (218, 'AG3783'), (474, 'AG3782'), (230, 'AG4141')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15332 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (171, 'AG4160'), (172, 'AG4159'), (218, 'AG3783'), (474, 'AG3782'), (228, 'AG4157')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (175, 'AG4176'), (176, 'AG4175'), (218, 'AG3783'), (474, 'AG3782'), (226, 'AG4173')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (179, 'AG4192'), (180, 'AG4191'), (218, 'AG3783'), (474, 'AG3782'), (224, 'AG4189')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16085 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (183, 'AG4208'), (184, 'AG4207'), (218, 'AG3783'), (474, 'AG3782'), (222, 'AG4205')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 2, 2, 16, 2, 128, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3784'), (187, 'AG4224'), (188, 'AG4223'), (218, 'AG3783'), (474, 'AG3782'), (220, 'AG4221')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(78, 'AG3797'), (273, 'AG3796'), (79, 'AG3792'), (80, 'AG3791'), (81, 'AG3790'), (358, 'AG3795'), (470, 'AG3794')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (79, 'AG3792'), (80, 'AG3791'), (471, 'AG3793')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23387 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (79, 'AG3792'), (80, 'AG3791'), (274, 'AG3789'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(79, 'AG3792'), (191, 'AG3787'), (80, 'AG3791'), (81, 'AG3790'), (274, 'AG3789'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(469, 'AG3798'), (74, 'AG3800'), (357, 'AG3799')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(37, 'AG3804'), (1, 'AG3801'), (356, 'AG3803'), (468, 'AG3802')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23399 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(82, 'AG3813'), (271, 'AG3812'), (83, 'AG3808'), (84, 'AG3807'), (85, 'AG3806'), (355, 'AG3811'), (466, 'AG3810')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (83, 'AG3808'), (84, 'AG3807'), (467, 'AG3809')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23402 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (83, 'AG3808'), (84, 'AG3807'), (272, 'AG3805'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(83, 'AG3808'), (191, 'AG3787'), (84, 'AG3807'), (85, 'AG3806'), (272, 'AG3805'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(465, 'AG3814'), (75, 'AG3816'), (354, 'AG3815')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(38, 'AG3820'), (2, 'AG3817'), (353, 'AG3819'), (464, 'AG3818')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(86, 'AG3829'), (269, 'AG3828'), (87, 'AG3824'), (88, 'AG3823'), (89, 'AG3822'), (352, 'AG3827'), (462, 'AG3826')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (87, 'AG3824'), (88, 'AG3823'), (463, 'AG3825')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23417 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (87, 'AG3824'), (88, 'AG3823'), (270, 'AG3821'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(87, 'AG3824'), (191, 'AG3787'), (88, 'AG3823'), (89, 'AG3822'), (270, 'AG3821'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(461, 'AG3830'), (76, 'AG3832'), (351, 'AG3831')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(39, 'AG3836'), (3, 'AG3833'), (350, 'AG3835'), (460, 'AG3834')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input91|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23429 of IO tensor {'CrossPassTensor': ''}bfloat16 %input93|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(90, 'AG3845'), (267, 'AG3844'), (91, 'AG3840'), (92, 'AG3839'), (93, 'AG3838'), (349, 'AG3843'), (458, 'AG3842')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (91, 'AG3840'), (92, 'AG3839'), (459, 'AG3841')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23432 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (91, 'AG3840'), (92, 'AG3839'), (268, 'AG3837'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(91, 'AG3840'), (191, 'AG3787'), (92, 'AG3839'), (93, 'AG3838'), (268, 'AG3837'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(457, 'AG3846'), (192, 'AG3848'), (348, 'AG3847')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(40, 'AG3852'), (4, 'AG3849'), (347, 'AG3851'), (456, 'AG3850')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3849'), (192, 'AG3848'), (191, 'AG3787'), (348, 'AG3847'), (457, 'AG3846')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3849'), (192, 'AG3848'), (191, 'AG3787'), (348, 'AG3847'), (457, 'AG3846')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(94, 'AG3861'), (265, 'AG3860'), (95, 'AG3856'), (96, 'AG3855'), (97, 'AG3854'), (346, 'AG3859'), (454, 'AG3858')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (95, 'AG3856'), (96, 'AG3855'), (455, 'AG3857')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (95, 'AG3856'), (96, 'AG3855'), (266, 'AG3853'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(95, 'AG3856'), (191, 'AG3787'), (96, 'AG3855'), (97, 'AG3854'), (266, 'AG3853'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(453, 'AG3862'), (193, 'AG3864'), (345, 'AG3863')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(41, 'AG3868'), (5, 'AG3865'), (344, 'AG3867'), (452, 'AG3866')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3865'), (193, 'AG3864'), (191, 'AG3787'), (345, 'AG3863'), (453, 'AG3862')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3865'), (193, 'AG3864'), (191, 'AG3787'), (345, 'AG3863'), (453, 'AG3862')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(98, 'AG3877'), (263, 'AG3876'), (99, 'AG3872'), (100, 'AG3871'), (101, 'AG3870'), (343, 'AG3875'), (450, 'AG3874')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (99, 'AG3872'), (100, 'AG3871'), (451, 'AG3873')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (99, 'AG3872'), (100, 'AG3871'), (264, 'AG3869'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(99, 'AG3872'), (191, 'AG3787'), (100, 'AG3871'), (101, 'AG3870'), (264, 'AG3869'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(449, 'AG3878'), (194, 'AG3880'), (342, 'AG3879')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG3884'), (6, 'AG3881'), (341, 'AG3883'), (448, 'AG3882')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3881'), (194, 'AG3880'), (191, 'AG3787'), (342, 'AG3879'), (449, 'AG3878')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3881'), (194, 'AG3880'), (191, 'AG3787'), (342, 'AG3879'), (449, 'AG3878')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(102, 'AG3893'), (261, 'AG3892'), (103, 'AG3888'), (104, 'AG3887'), (105, 'AG3886'), (340, 'AG3891'), (446, 'AG3890')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (103, 'AG3888'), (104, 'AG3887'), (447, 'AG3889')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (103, 'AG3888'), (104, 'AG3887'), (262, 'AG3885'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(103, 'AG3888'), (191, 'AG3787'), (104, 'AG3887'), (105, 'AG3886'), (262, 'AG3885'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(445, 'AG3894'), (195, 'AG3896'), (339, 'AG3895')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(43, 'AG3900'), (7, 'AG3897'), (338, 'AG3899'), (444, 'AG3898')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3897'), (195, 'AG3896'), (191, 'AG3787'), (339, 'AG3895'), (445, 'AG3894')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3897'), (195, 'AG3896'), (191, 'AG3787'), (339, 'AG3895'), (445, 'AG3894')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(106, 'AG3909'), (259, 'AG3908'), (107, 'AG3904'), (108, 'AG3903'), (109, 'AG3902'), (337, 'AG3907'), (442, 'AG3906')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (107, 'AG3904'), (108, 'AG3903'), (443, 'AG3905')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23492 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (107, 'AG3904'), (108, 'AG3903'), (260, 'AG3901'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(107, 'AG3904'), (191, 'AG3787'), (108, 'AG3903'), (109, 'AG3902'), (260, 'AG3901'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(441, 'AG3910'), (196, 'AG3912'), (336, 'AG3911')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(44, 'AG3916'), (8, 'AG3913'), (335, 'AG3915'), (440, 'AG3914')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3913'), (196, 'AG3912'), (191, 'AG3787'), (336, 'AG3911'), (441, 'AG3910')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3913'), (196, 'AG3912'), (191, 'AG3787'), (336, 'AG3911'), (441, 'AG3910')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(110, 'AG3925'), (257, 'AG3924'), (111, 'AG3920'), (112, 'AG3919'), (113, 'AG3918'), (334, 'AG3923'), (438, 'AG3922')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (111, 'AG3920'), (112, 'AG3919'), (439, 'AG3921')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23507 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (111, 'AG3920'), (112, 'AG3919'), (258, 'AG3917'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(111, 'AG3920'), (191, 'AG3787'), (112, 'AG3919'), (113, 'AG3918'), (258, 'AG3917'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(437, 'AG3926'), (197, 'AG3928'), (333, 'AG3927')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG3932'), (9, 'AG3929'), (332, 'AG3931'), (436, 'AG3930')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3929'), (197, 'AG3928'), (191, 'AG3787'), (333, 'AG3927'), (437, 'AG3926')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3929'), (197, 'AG3928'), (191, 'AG3787'), (333, 'AG3927'), (437, 'AG3926')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(114, 'AG3941'), (255, 'AG3940'), (115, 'AG3936'), (116, 'AG3935'), (117, 'AG3934'), (331, 'AG3939'), (434, 'AG3938')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (115, 'AG3936'), (116, 'AG3935'), (435, 'AG3937')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23522 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (115, 'AG3936'), (116, 'AG3935'), (256, 'AG3933'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(115, 'AG3936'), (191, 'AG3787'), (116, 'AG3935'), (117, 'AG3934'), (256, 'AG3933'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(433, 'AG3942'), (198, 'AG3944'), (330, 'AG3943')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(46, 'AG3948'), (10, 'AG3945'), (329, 'AG3947'), (432, 'AG3946')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3945'), (198, 'AG3944'), (191, 'AG3787'), (330, 'AG3943'), (433, 'AG3942')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3945'), (198, 'AG3944'), (191, 'AG3787'), (330, 'AG3943'), (433, 'AG3942')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(118, 'AG3957'), (253, 'AG3956'), (119, 'AG3952'), (120, 'AG3951'), (121, 'AG3950'), (328, 'AG3955'), (430, 'AG3954')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (119, 'AG3952'), (120, 'AG3951'), (431, 'AG3953')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23537 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (119, 'AG3952'), (120, 'AG3951'), (254, 'AG3949'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(119, 'AG3952'), (191, 'AG3787'), (120, 'AG3951'), (121, 'AG3950'), (254, 'AG3949'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(429, 'AG3958'), (199, 'AG3960'), (327, 'AG3959')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(47, 'AG3964'), (11, 'AG3961'), (326, 'AG3963'), (428, 'AG3962')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3961'), (199, 'AG3960'), (191, 'AG3787'), (327, 'AG3959'), (429, 'AG3958')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3961'), (199, 'AG3960'), (191, 'AG3787'), (327, 'AG3959'), (429, 'AG3958')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(122, 'AG3973'), (251, 'AG3972'), (123, 'AG3968'), (124, 'AG3967'), (125, 'AG3966'), (325, 'AG3971'), (426, 'AG3970')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (123, 'AG3968'), (124, 'AG3967'), (427, 'AG3969')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23552 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (123, 'AG3968'), (124, 'AG3967'), (252, 'AG3965'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(123, 'AG3968'), (191, 'AG3787'), (124, 'AG3967'), (125, 'AG3966'), (252, 'AG3965'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(425, 'AG3974'), (200, 'AG3976'), (324, 'AG3975')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(48, 'AG3980'), (12, 'AG3977'), (323, 'AG3979'), (424, 'AG3978')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3977'), (200, 'AG3976'), (191, 'AG3787'), (324, 'AG3975'), (425, 'AG3974')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3977'), (200, 'AG3976'), (191, 'AG3787'), (324, 'AG3975'), (425, 'AG3974')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(126, 'AG3989'), (249, 'AG3988'), (127, 'AG3984'), (128, 'AG3983'), (129, 'AG3982'), (322, 'AG3987'), (422, 'AG3986')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (127, 'AG3984'), (128, 'AG3983'), (423, 'AG3985')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (127, 'AG3984'), (128, 'AG3983'), (250, 'AG3981'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(127, 'AG3984'), (191, 'AG3787'), (128, 'AG3983'), (129, 'AG3982'), (250, 'AG3981'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(421, 'AG3990'), (201, 'AG3992'), (321, 'AG3991')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(49, 'AG3996'), (13, 'AG3993'), (320, 'AG3995'), (420, 'AG3994')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3993'), (201, 'AG3992'), (191, 'AG3787'), (321, 'AG3991'), (421, 'AG3990')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3993'), (201, 'AG3992'), (191, 'AG3787'), (321, 'AG3991'), (421, 'AG3990')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(130, 'AG4005'), (247, 'AG4004'), (131, 'AG4000'), (132, 'AG3999'), (133, 'AG3998'), (319, 'AG4003'), (418, 'AG4002')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (131, 'AG4000'), (132, 'AG3999'), (419, 'AG4001')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (131, 'AG4000'), (132, 'AG3999'), (248, 'AG3997'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG4000'), (191, 'AG3787'), (132, 'AG3999'), (133, 'AG3998'), (248, 'AG3997'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(417, 'AG4006'), (202, 'AG4008'), (318, 'AG4007')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(50, 'AG4012'), (14, 'AG4009'), (317, 'AG4011'), (416, 'AG4010')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG4009'), (202, 'AG4008'), (191, 'AG3787'), (318, 'AG4007'), (417, 'AG4006')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG4009'), (202, 'AG4008'), (191, 'AG3787'), (318, 'AG4007'), (417, 'AG4006')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(134, 'AG4021'), (245, 'AG4020'), (135, 'AG4016'), (136, 'AG4015'), (137, 'AG4014'), (316, 'AG4019'), (414, 'AG4018')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (135, 'AG4016'), (136, 'AG4015'), (415, 'AG4017')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23597 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (135, 'AG4016'), (136, 'AG4015'), (246, 'AG4013'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(135, 'AG4016'), (191, 'AG3787'), (136, 'AG4015'), (137, 'AG4014'), (246, 'AG4013'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(413, 'AG4022'), (203, 'AG4024'), (315, 'AG4023')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(51, 'AG4028'), (15, 'AG4025'), (314, 'AG4027'), (412, 'AG4026')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG4025'), (203, 'AG4024'), (191, 'AG3787'), (315, 'AG4023'), (413, 'AG4022')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG4025'), (203, 'AG4024'), (191, 'AG3787'), (315, 'AG4023'), (413, 'AG4022')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(138, 'AG4037'), (243, 'AG4036'), (139, 'AG4032'), (140, 'AG4031'), (141, 'AG4030'), (313, 'AG4035'), (410, 'AG4034')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (139, 'AG4032'), (140, 'AG4031'), (411, 'AG4033')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23612 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (139, 'AG4032'), (140, 'AG4031'), (244, 'AG4029'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(139, 'AG4032'), (191, 'AG3787'), (140, 'AG4031'), (141, 'AG4030'), (244, 'AG4029'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(409, 'AG4038'), (204, 'AG4040'), (312, 'AG4039')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(52, 'AG4044'), (16, 'AG4041'), (311, 'AG4043'), (408, 'AG4042')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG4041'), (204, 'AG4040'), (191, 'AG3787'), (312, 'AG4039'), (409, 'AG4038')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG4041'), (204, 'AG4040'), (191, 'AG3787'), (312, 'AG4039'), (409, 'AG4038')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(142, 'AG4053'), (241, 'AG4052'), (143, 'AG4048'), (144, 'AG4047'), (145, 'AG4046'), (310, 'AG4051'), (406, 'AG4050')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (143, 'AG4048'), (144, 'AG4047'), (407, 'AG4049')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23627 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (143, 'AG4048'), (144, 'AG4047'), (242, 'AG4045'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(143, 'AG4048'), (191, 'AG3787'), (144, 'AG4047'), (145, 'AG4046'), (242, 'AG4045'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(405, 'AG4054'), (205, 'AG4056'), (309, 'AG4055')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(53, 'AG4060'), (17, 'AG4057'), (308, 'AG4059'), (404, 'AG4058')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4057'), (205, 'AG4056'), (191, 'AG3787'), (309, 'AG4055'), (405, 'AG4054')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4057'), (205, 'AG4056'), (191, 'AG3787'), (309, 'AG4055'), (405, 'AG4054')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(146, 'AG4069'), (239, 'AG4068'), (147, 'AG4064'), (148, 'AG4063'), (149, 'AG4062'), (307, 'AG4067'), (402, 'AG4066')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (147, 'AG4064'), (148, 'AG4063'), (403, 'AG4065')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23642 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (147, 'AG4064'), (148, 'AG4063'), (240, 'AG4061'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(147, 'AG4064'), (191, 'AG3787'), (148, 'AG4063'), (149, 'AG4062'), (240, 'AG4061'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(401, 'AG4070'), (206, 'AG4072'), (306, 'AG4071')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(54, 'AG4076'), (18, 'AG4073'), (305, 'AG4075'), (400, 'AG4074')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4073'), (206, 'AG4072'), (191, 'AG3787'), (306, 'AG4071'), (401, 'AG4070')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4073'), (206, 'AG4072'), (191, 'AG3787'), (306, 'AG4071'), (401, 'AG4070')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(150, 'AG4085'), (237, 'AG4084'), (151, 'AG4080'), (152, 'AG4079'), (153, 'AG4078'), (304, 'AG4083'), (398, 'AG4082')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (151, 'AG4080'), (152, 'AG4079'), (399, 'AG4081')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (151, 'AG4080'), (152, 'AG4079'), (238, 'AG4077'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(151, 'AG4080'), (191, 'AG3787'), (152, 'AG4079'), (153, 'AG4078'), (238, 'AG4077'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(397, 'AG4086'), (207, 'AG4088'), (303, 'AG4087')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(55, 'AG4092'), (19, 'AG4089'), (302, 'AG4091'), (396, 'AG4090')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4089'), (207, 'AG4088'), (191, 'AG3787'), (303, 'AG4087'), (397, 'AG4086')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4089'), (207, 'AG4088'), (191, 'AG3787'), (303, 'AG4087'), (397, 'AG4086')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23682 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(154, 'AG4101'), (235, 'AG4100'), (155, 'AG4096'), (156, 'AG4095'), (157, 'AG4094'), (301, 'AG4099'), (394, 'AG4098')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (155, 'AG4096'), (156, 'AG4095'), (395, 'AG4097')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (155, 'AG4096'), (156, 'AG4095'), (236, 'AG4093'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(155, 'AG4096'), (191, 'AG3787'), (156, 'AG4095'), (157, 'AG4094'), (236, 'AG4093'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(393, 'AG4102'), (208, 'AG4104'), (300, 'AG4103')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(56, 'AG4108'), (20, 'AG4105'), (299, 'AG4107'), (392, 'AG4106')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4105'), (208, 'AG4104'), (191, 'AG3787'), (300, 'AG4103'), (393, 'AG4102')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23683 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4105'), (208, 'AG4104'), (191, 'AG3787'), (300, 'AG4103'), (393, 'AG4102')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(158, 'AG4117'), (233, 'AG4116'), (159, 'AG4112'), (160, 'AG4111'), (161, 'AG4110'), (298, 'AG4115'), (390, 'AG4114')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (159, 'AG4112'), (160, 'AG4111'), (391, 'AG4113')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (159, 'AG4112'), (160, 'AG4111'), (234, 'AG4109'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(159, 'AG4112'), (191, 'AG3787'), (160, 'AG4111'), (161, 'AG4110'), (234, 'AG4109'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(389, 'AG4118'), (209, 'AG4120'), (297, 'AG4119')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(57, 'AG4124'), (21, 'AG4121'), (296, 'AG4123'), (388, 'AG4122')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4121'), (209, 'AG4120'), (191, 'AG3787'), (297, 'AG4119'), (389, 'AG4118')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4121'), (209, 'AG4120'), (191, 'AG3787'), (297, 'AG4119'), (389, 'AG4118')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(162, 'AG4133'), (231, 'AG4132'), (163, 'AG4128'), (164, 'AG4127'), (165, 'AG4126'), (295, 'AG4131'), (386, 'AG4130')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (163, 'AG4128'), (164, 'AG4127'), (387, 'AG4129')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (163, 'AG4128'), (164, 'AG4127'), (232, 'AG4125'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(163, 'AG4128'), (191, 'AG3787'), (164, 'AG4127'), (165, 'AG4126'), (232, 'AG4125'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(385, 'AG4134'), (210, 'AG4136'), (294, 'AG4135')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(58, 'AG4140'), (22, 'AG4137'), (293, 'AG4139'), (384, 'AG4138')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4137'), (210, 'AG4136'), (191, 'AG3787'), (294, 'AG4135'), (385, 'AG4134')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4137'), (210, 'AG4136'), (191, 'AG3787'), (294, 'AG4135'), (385, 'AG4134')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(166, 'AG4149'), (229, 'AG4148'), (167, 'AG4144'), (168, 'AG4143'), (169, 'AG4142'), (292, 'AG4147'), (382, 'AG4146')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (167, 'AG4144'), (168, 'AG4143'), (383, 'AG4145')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (167, 'AG4144'), (168, 'AG4143'), (230, 'AG4141'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(167, 'AG4144'), (191, 'AG3787'), (168, 'AG4143'), (169, 'AG4142'), (230, 'AG4141'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(381, 'AG4150'), (211, 'AG4152'), (291, 'AG4151')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(59, 'AG4156'), (23, 'AG4153'), (290, 'AG4155'), (380, 'AG4154')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4153'), (211, 'AG4152'), (191, 'AG3787'), (291, 'AG4151'), (381, 'AG4150')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4153'), (211, 'AG4152'), (191, 'AG3787'), (291, 'AG4151'), (381, 'AG4150')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(170, 'AG4165'), (227, 'AG4164'), (171, 'AG4160'), (172, 'AG4159'), (173, 'AG4158'), (289, 'AG4163'), (378, 'AG4162')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (171, 'AG4160'), (172, 'AG4159'), (379, 'AG4161')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (171, 'AG4160'), (172, 'AG4159'), (228, 'AG4157'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(171, 'AG4160'), (191, 'AG3787'), (172, 'AG4159'), (173, 'AG4158'), (228, 'AG4157'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(377, 'AG4166'), (212, 'AG4168'), (288, 'AG4167')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(60, 'AG4172'), (24, 'AG4169'), (287, 'AG4171'), (376, 'AG4170')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4169'), (212, 'AG4168'), (191, 'AG3787'), (288, 'AG4167'), (377, 'AG4166')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4169'), (212, 'AG4168'), (191, 'AG3787'), (288, 'AG4167'), (377, 'AG4166')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(174, 'AG4181'), (225, 'AG4180'), (175, 'AG4176'), (176, 'AG4175'), (177, 'AG4174'), (286, 'AG4179'), (374, 'AG4178')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (175, 'AG4176'), (176, 'AG4175'), (375, 'AG4177')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23747 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (175, 'AG4176'), (176, 'AG4175'), (226, 'AG4173'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(175, 'AG4176'), (191, 'AG3787'), (176, 'AG4175'), (177, 'AG4174'), (226, 'AG4173'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(373, 'AG4182'), (213, 'AG4184'), (285, 'AG4183')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(61, 'AG4188'), (25, 'AG4185'), (284, 'AG4187'), (372, 'AG4186')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4185'), (213, 'AG4184'), (191, 'AG3787'), (285, 'AG4183'), (373, 'AG4182')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4185'), (213, 'AG4184'), (191, 'AG3787'), (285, 'AG4183'), (373, 'AG4182')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(178, 'AG4197'), (223, 'AG4196'), (179, 'AG4192'), (180, 'AG4191'), (181, 'AG4190'), (283, 'AG4195'), (370, 'AG4194')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (179, 'AG4192'), (180, 'AG4191'), (371, 'AG4193')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (179, 'AG4192'), (180, 'AG4191'), (224, 'AG4189'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(179, 'AG4192'), (191, 'AG3787'), (180, 'AG4191'), (181, 'AG4190'), (224, 'AG4189'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(369, 'AG4198'), (214, 'AG4200'), (282, 'AG4199')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(62, 'AG4204'), (26, 'AG4201'), (281, 'AG4203'), (368, 'AG4202')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4201'), (214, 'AG4200'), (191, 'AG3787'), (282, 'AG4199'), (369, 'AG4198')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23773 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4201'), (214, 'AG4200'), (191, 'AG3787'), (282, 'AG4199'), (369, 'AG4198')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23787 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(182, 'AG4213'), (221, 'AG4212'), (183, 'AG4208'), (184, 'AG4207'), (185, 'AG4206'), (280, 'AG4211'), (366, 'AG4210')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (183, 'AG4208'), (184, 'AG4207'), (367, 'AG4209')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23777 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (183, 'AG4208'), (184, 'AG4207'), (222, 'AG4205'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(183, 'AG4208'), (191, 'AG3787'), (184, 'AG4207'), (185, 'AG4206'), (222, 'AG4205'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(365, 'AG4214'), (215, 'AG4216'), (279, 'AG4215')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(63, 'AG4220'), (27, 'AG4217'), (278, 'AG4219'), (364, 'AG4218')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4217'), (215, 'AG4216'), (191, 'AG3787'), (279, 'AG4215'), (365, 'AG4214')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4217'), (215, 'AG4216'), (191, 'AG3787'), (279, 'AG4215'), (365, 'AG4214')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23802 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(186, 'AG4229'), (219, 'AG4228'), (187, 'AG4224'), (188, 'AG4223'), (189, 'AG4222'), (277, 'AG4227'), (362, 'AG4226')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (187, 'AG4224'), (188, 'AG4223'), (363, 'AG4225')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23792 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (187, 'AG4224'), (188, 'AG4223'), (220, 'AG4221'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(187, 'AG4224'), (191, 'AG3787'), (188, 'AG4223'), (189, 'AG4222'), (220, 'AG4221'), (472, 'AG3788')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(361, 'AG4230'), (216, 'AG4232'), (276, 'AG4231')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(64, 'AG4236'), (28, 'AG4233'), (275, 'AG4235'), (360, 'AG4234')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4233'), (216, 'AG4232'), (191, 'AG3787'), (276, 'AG4231'), (361, 'AG4230')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23803 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4233'), (216, 'AG4232'), (191, 'AG3787'), (276, 'AG4231'), (361, 'AG4230')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(190, 'AG4238'), (217, 'AG4237'), (191, 'AG3787')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3785'), (359, 'AG3786')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23385 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (77, 'AG3784')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23386 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3787'), (77, 'AG3784')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16621 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4252'), (33, 'AG4251')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23361 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(69, 'AG4245'), (32, 'AG4249')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16596 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(69, 'AG4245'), (32, 'AG4249')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16639 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4252'), (33, 'AG4251')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16656 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4252'), (33, 'AG4251')] +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.639 seconds +2025-11-04T21:39:46Z INFO 8853 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.278 seconds +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.292 seconds +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.068 seconds +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.032 seconds +2025-11-04T21:39:47Z INFO 8853 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:39:50Z INFO 8853 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:39:50Z INFO 8853 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.602 seconds +2025-11-04T21:39:50Z INFO 8853 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 5.951 seconds +2025-11-04T21:39:50Z INFO 8853 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.902 seconds +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.126 seconds +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.332 seconds +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 41.185 seconds +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8853 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.487 seconds +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.327 seconds +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:39:52Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.656 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.067 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.724 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.300 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.301 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.087 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.045 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.166 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.120 seconds +2025-11-04T21:39:54Z INFO 8853 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.174 seconds +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.093 seconds +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.347 seconds +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.348 seconds +2025-11-04T21:39:58Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.697 seconds +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.424 seconds +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:59Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.512 seconds +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.509 seconds +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 1.022 seconds +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.073 seconds +2025-11-04T21:40:00Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.226 seconds +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.088 seconds +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.032 seconds +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.111 seconds +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.289 seconds +2025-11-04T21:40:01Z INFO 8853 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:40:02Z INFO 8853 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8853 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.356 seconds +2025-11-04T21:40:02Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:02Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.497 seconds +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.498 seconds +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.073 seconds +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.296 seconds +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/LICM]: LICM finished after 0.147 seconds +2025-11-04T21:40:03Z INFO 8853 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.836 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.020 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.035 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.079 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.073 seconds +2025-11-04T21:40:05Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.349 seconds +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.596 seconds +2025-11-04T21:40:06Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.270 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.247 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.241 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.243 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.598 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.064 seconds +2025-11-04T21:40:07Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.328 seconds +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.497 seconds +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:08Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:09Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.033 seconds +2025-11-04T21:40:09Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.193 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.170 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.408 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.119 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.189 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.179 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.375 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.090 seconds +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:40:10Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.639 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.049 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.689 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.278 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.050 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.054 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.054 seconds +2025-11-04T21:40:11Z INFO 8853 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.083 seconds +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.718 seconds +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.718 seconds +2025-11-04T21:40:12Z INFO 8853 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:40:13Z INFO 8853 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:40:13Z INFO 8853 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.081 seconds +2025-11-04T21:40:13Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:13Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.514 seconds +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.514 seconds +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.061 seconds +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:40:14Z INFO 8853 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 1.131 seconds +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 1.132 seconds +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.186 seconds +2025-11-04T21:40:15Z INFO 8853 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.522 seconds +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.066 seconds +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:16Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.945 seconds +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.174 seconds +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.127 seconds +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.051 seconds +2025-11-04T21:40:17Z INFO 8853 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:18Z INFO 8853 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:18Z INFO 8853 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.619 seconds +2025-11-04T21:40:18Z INFO 8853 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.301 seconds +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.057 seconds +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.144 seconds +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.269 seconds +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:19Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:20Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.716 seconds +2025-11-04T21:40:20Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.617 seconds +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.333 seconds +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.058 seconds +2025-11-04T21:40:21Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.972 seconds +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.117 seconds +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.167 seconds +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.277 seconds +2025-11-04T21:40:22Z INFO 8853 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.087 seconds +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.035 seconds +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.340 seconds +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.040 seconds +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.156 seconds +2025-11-04T21:40:23Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:27Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 2.041 seconds +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.215 seconds +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 6.364 seconds +2025-11-04T21:40:29Z INFO 8853 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.152 seconds +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.174 seconds +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.067 seconds +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.125 seconds +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 15.174% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'38284.56587'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=56586, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_38284 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 2.301% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'38797.56597'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=56595, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_38797 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 138.776us (32.031MiB, est bw: 242.024GB/s, 1.383% of tot. time) for bfloat16<128 x 8200> TongaSB partitions[2] bfloat16 (2, 8, 128, 8200) %'all_gather.1_nostride_60811'(init=0.0)[i242_0_0_43110,T_i2,i0.128,i1.8200] = load bfloat16<128 x 8200> non_local bfloat16 (16384,) %'all_gather.1'[8i0.128+1024T_i2+i1.8200] # id=48444, src_id=None, , attrs={'can_read_uninit': True}, instances=16 # dl = tensor_op_name: _add.383 | hlo_id: 383 | [[i0.128];[i1.8200]] -> [[i0.128];[i1.8200]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input68_local_40205'[i242_0_0_43110,4i243_0_0_0+i243_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input68'[4i243_0_0_0+i243_0_0_1,i242_0_0_43110,i0.128,i1.3072] # id=48459, src_id=None, , instances=16 # dl = tensor_op_name: _dot.413 | hlo_id: 13522 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input79_local_40307'[i414_0_0_43184,4i415_0_0_0+i415_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input79'[4i415_0_0_0+i415_0_0_1,i414_0_0_43184,i0.128,i1.3072] # id=48653, src_id=None, , instances=16 # dl = tensor_op_name: _dot.757 | hlo_id: 13633 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input90_local_40409'[i586_0_0_43258,4i587_0_0_0+i587_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input90'[4i587_0_0_0+i587_0_0_1,i586_0_0_43258,i0.128,i1.3072] # id=48847, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1101 | hlo_id: 13744 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input101_local_40511'[i758_0_0_43332,4i759_0_0_0+i759_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input101'[4i759_0_0_0+i759_0_0_1,i758_0_0_43332,i0.128,i1.3072] # id=49041, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1445 | hlo_id: 13855 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input112_local_40613'[i930_0_0_43406,4i931_0_0_0+i931_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input112'[4i931_0_0_0+i931_0_0_1,i930_0_0_43406,i0.128,i1.3072] # id=49235, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1789 | hlo_id: 13966 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input123_local_40715'[i1102_0_0_43480,4i1103_0_0_0+i1103_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input123'[4i1103_0_0_0+i1103_0_0_1,i1102_0_0_43480,i0.128,i1.3072] # id=49429, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2133 | hlo_id: 14077 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.586% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input134_local_40817'[i1274_0_0_43554,4i1275_0_0_0+i1275_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input134'[4i1275_0_0_0+i1275_0_0_1,i1274_0_0_43554,i0.128,i1.3072] # id=49623, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2477 | hlo_id: 14188 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.079 seconds +2025-11-04T21:40:30Z INFO 8853 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.012 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.019 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.026 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.012 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.048 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.036 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.008 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8853 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.526 seconds +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.793 seconds +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.793 seconds +2025-11-04T21:40:32Z INFO 8853 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:40:32Z WARNING 8853 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 76.89 percent of all matmul computation +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.132 seconds +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.493 seconds +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.103 seconds +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.110 seconds +2025-11-04T21:40:33Z INFO 8853 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.217 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.627 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.628 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.142 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.162 seconds +2025-11-04T21:40:34Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:36Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.857 seconds +2025-11-04T21:40:38Z INFO 8853 [Tensorizer]: BirCodeGen estimate #instances=69306 in sg0000 +2025-11-04T21:40:38Z INFO 8853 [Tensorizer]: IR signature: a2230bcfe2bb95a6e5f4da88643e98eef7eac0375b9c41aae3e62db27ee6be5d for nc00/sg0000/TensorizerBIR +2025-11-04T21:40:38Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:40Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:40Z INFO 8853 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.823 seconds +2025-11-04T21:40:42Z INFO 8853 [Tensorizer]: BirCodeGen estimate #instances=69306 in sg0000 +2025-11-04T21:40:42Z INFO 8853 [Tensorizer]: IR signature: 795b433145d937e9801d11c39f23aae3d9b06cab47c7ec951ddc92df1c041d98 for nc01/sg0000/TensorizerBIR +2025-11-04T21:40:42Z INFO 8853 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:40:42Z INFO 8853 [Tensorizer]: Successfully built model. +2025-11-04T21:40:42Z USER 8853 [root/Tensorizer/Tensorizer]: Tensorizer finished after 124.323 seconds +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: End tensorization +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input0 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input1 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input2 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input3 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input4 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input5 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input6 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input7 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input8 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input9 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input10 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input11 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input12 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input13 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input14 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input15 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input16 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input17 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input18 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input19 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input20 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input21 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input22 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input23 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input24 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input25 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input26 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input27 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input28 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input29 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input30 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input31 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input32 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input33 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input34 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input35 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input36 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input37 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input38 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input39 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input40 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input41 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input42 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input43 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input44 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input45 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input46 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input47 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input48 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input49 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input50 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input51 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input52 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input53 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input54 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input55 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input56 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input57 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input58 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input59 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input60 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input61 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input62 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input63 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input64 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input65 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input66 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input67 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input68 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input69 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input70 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input71 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input72 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input73 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input74 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input75 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input76 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input77 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input78 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input79 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input80 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input81 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input82 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input83 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input84 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input85 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input86 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input87 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input88 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input89 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input90 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input91 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input92 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input93 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input94 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input95 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input96 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input97 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input98 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input99 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input100 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input101 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input102 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input103 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input104 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input105 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input106 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input107 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input108 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input109 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input110 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input111 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input112 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input113 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input114 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input115 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input116 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input117 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input118 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input119 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input120 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input121 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input122 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input123 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input124 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input125 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input126 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input127 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input128 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input129 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input130 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input131 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input132 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input133 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input134 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input135 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input136 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input137 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input138 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input139 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input140 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input141 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input142 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input143 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input144 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input145 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input146 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input147 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input148 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input149 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input150 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input151 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input152 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input153 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input154 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input155 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input156 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input157 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input158 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input159 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input160 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input161 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input162 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input163 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input164 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input165 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input166 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input167 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input168 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input169 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input170 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input171 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input172 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input173 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input174 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input175 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input176 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input177 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input178 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input179 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input180 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input181 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input182 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input183 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input184 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input185 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input186 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input187 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input188 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input189 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input190 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input191 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input192 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input193 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input194 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input195 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input196 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input197 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input198 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input199 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input200 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input201 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input202 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input203 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input204 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input205 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input206 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input207 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input208 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input209 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input210 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input211 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input212 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input213 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input214 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input215 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input216 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input217 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input218 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input219 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input220 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input221 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input222 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input223 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input224 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input225 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input226 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input227 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input228 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input229 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input230 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input231 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input232 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input233 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input234 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input235 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input236 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input237 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input238 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input239 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input240 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input241 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input242 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input243 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input244 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input245 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input246 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input247 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input248 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input249 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input250 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input251 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input252 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input253 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input254 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input255 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input256 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input257 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input258 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input259 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input260 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input261 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input262 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input263 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input264 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input265 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input266 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input267 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input268 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input269 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input270 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input271 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input272 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input273 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input274 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input275 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input276 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input277 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input278 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input279 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input280 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input281 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input282 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input283 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input284 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input285 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input286 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input287 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input288 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input289 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input290 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input291 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input292 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input293 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input294 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input295 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input296 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input297 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input298 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input299 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input300 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input301 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input302 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input303 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input304 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input305 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input306 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input307 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input308 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input309 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input310 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input311 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input312 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input313 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input314 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input315 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input316 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input317 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input318 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input319 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input320 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input321 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input322 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input323 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input324 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input325 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input326 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input327 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input328 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input329 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input330 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input331 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input332 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input333 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input334 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input335 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input336 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input337 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input338 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input339 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input340 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input341 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input342 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input343 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input344 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input345 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input346 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input347 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input348 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input349 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input350 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input351 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input352 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input353 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input354 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input355 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input356 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input357 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input358 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input359 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input360 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input361 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input362 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input363 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input364 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input365 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input366 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input367 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input368 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input369 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Network input: input370 +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:42Z INFO 8853 [job.Frontend.0]: Job #0 finished +2025-11-04T21:40:42Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:40:42Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:40:42Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:40:42Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels io,spill_reload,vector_dynamic_offsets,scalar_dynamic_offset --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:40:42Z INFO 8853 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:40:42Z INFO 9505 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg" +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:40:42Z INFO 9505 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:40:42Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:42Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12114 blocks=2 instructions=10388 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:42Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:42Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z USER 9505 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:40:42Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:42Z USER 9505 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.001 seconds +2025-11-04T21:40:42Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 373mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:42Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:42Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:42Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:42Z WARNING 9505 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.363.63823}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:42Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.363.63823}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.116 seconds +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 538mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.125 seconds +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:43Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.134 seconds +2025-11-04T21:40:43Z INFO 9505 [BackendPassManager]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:43Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12114 blocks=2 instructions=10388 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:43Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12114 blocks=2 instructions=10388 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.004 seconds +2025-11-04T21:40:43Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12114 memory location(s), 2 block(s), and 10388 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:43Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:40:43Z INFO 9505 [BackendPassManager]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:43Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12114 blocks=2 instructions=10388 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.001 seconds +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 543mb, ru_maxrss: 911mb (delta=0mb) +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6057 memory location(s), 1 block(s), and 5194 instruction(s). Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:43 2025 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6057 blocks=1 instructions=5194 Max writers: 49 Max Readers: 341 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:43 2025 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:43 2025 + +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Total count: 56271 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Matmult: 44730 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: GenericCopy: 4952 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: TensorScalarPtr: 1743 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Load: 1384 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: TensorTensor: 1218 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Memset: 245 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Iota: 58 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Select: 30 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: unroll finished after 0.892 seconds +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1278mb, ru_maxrss: 1278mb (delta=367mb) +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 30789 memory location(s), 1 block(s), and 56271 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:43Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:43 2025 + +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=30789 blocks=1 instructions=56271 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Total count: 57431 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Matmult: 44730 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: GenericCopy: 4952 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: TensorScalarPtr: 2303 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Load: 1384 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: TensorTensor: 1218 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Iota: 394 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Save: 378 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Memset: 245 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Select: 30 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: unroll finished after 0.924 seconds +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 887mb, ru_maxrss: 1278mb (delta=367mb) +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 30789 memory location(s), 1 block(s), and 57431 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:43Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:43Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=30789 blocks=1 instructions=57431 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:43Z INFO 9505 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.134 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 889mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.127 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 889mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 1.061 seconds +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: curr_vmrss: 889mb, ru_maxrss: 1278mb (delta=367mb) +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.004 seconds +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 889mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25256 memory location(s), 2 block(s), and 112102 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.006 seconds +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: curr_vmrss: 889mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47972_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47981_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47990_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47999_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t48008_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t48017_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t48026_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t48035_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t48044_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t48053_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t48062_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t48071_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t48080_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t48089_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t48098_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t48107_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t48116_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t48125_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t48134_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t48143_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t48152_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t48161_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t48170_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t48179_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t48188_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t48197_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t48206_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t48215_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45317_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:44Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45322_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.116 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 904mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.124 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.127 seconds +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.014 seconds +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25256 memory location(s), 2 block(s), and 112102 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.017 seconds +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:44Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25256 blocks=2 instructions=112102 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.012 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.012 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.006 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.008 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.047 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.050 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.007 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z WARNING 9505 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 906mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.007 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 907mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z WARNING 9505 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 907mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:40:44Z INFO 9505 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:44Z INFO 9505 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:44Z INFO 9505 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:44Z INFO 9505 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.02 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.025 seconds +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.065 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.018 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.023 seconds +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.060 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.231 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.231 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.012 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.012 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 909mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.030 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 911mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.031 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 911mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.038 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.039 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.008 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.008 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.006 seconds +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11900 memory location(s), 1 block(s), and 54682 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=11900 blocks=1 instructions=54682 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.007 seconds +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 912mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13356 memory location(s), 1 block(s), and 57420 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=13356 blocks=1 instructions=57420 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: To Spill 2 multi-layer tensors +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: To Spill 3 multi-layer tensors +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:44Z INFO 9505 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc01/sg00) [build_flow_deps]: Allocs: 11906 instructions: 54688 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9505 (nc00/sg00) [build_flow_deps]: Allocs: 13364 instructions: 57424 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 142705 edges +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [build_flow_deps]: Done build fdeps 142705 Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 156970 edges +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [build_flow_deps]: Done build fdeps 156970 Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.446 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 940mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.445 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 940mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11906 memory location(s), 1 block(s), and 54688 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11906 blocks=1 instructions=54688 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13364 memory location(s), 1 block(s), and 57424 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=13364 blocks=1 instructions=57424 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 29 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 59 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.129 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11877 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=11877 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11878 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.131 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=11878 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13305 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=13305 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13306 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=13306 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.007 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.007 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.007 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.007 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 941mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: size = 5294 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: size = 5466 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: found 9981 edges +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: mean: 3.77068 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: median: 5.74971 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 79848 bytes +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: found 10039 edges +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: mean: 3.67325 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: median: 5.05628 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 80312 bytes +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: lo = 5192 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: hi = 102 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: total = 5294 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: lo = 5364 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: hi = 102 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: total = 5466 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.193 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.196 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.085 seconds +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.087 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 944mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 94 PSUM Banks +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 123 PSUM Banks +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 188 PSUM Banks +2025-11-04T21:40:45Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 333 PSUM Banks +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 168 PSUM Banks +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.375 seconds +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:45Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:45Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 485 PSUM Banks +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.393 seconds +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 948mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 946343606 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5756 bytes +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2804842 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 603 bytes +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 941243982 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5755 bytes +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1772544 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 463 bytes +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: size = 7260 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: size = 6012 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: found 4903 accumulation groups +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: largest = _dot.9689-t45101_i11 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: found 4731 accumulation groups +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: largest = _dot.9689-t45101_i23 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:46Z INFO 9505 []: find first defs for local +2025-11-04T21:40:46Z INFO 9505 []: find first defs for local +2025-11-04T21:40:46Z INFO 9505 []: find first defs for global +2025-11-04T21:40:46Z INFO 9505 []: find first defs for global +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: 1359 remat count +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: 1368 remat count +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Num intervals 6012 Num locations 6012 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: edge: 68218 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: mean: 22.6939 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: median: 14.0409 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: safe = 5992 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: unsafe = 16 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: total = 6010 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 6012 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Num intervals 7260 Num locations 7260 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: edge: 90706 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: mean: 24.9879 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: median: 15.8293 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Total: 6010 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (6010) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Rover zone: 0.896 (5382) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.098 (590) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.006 (38) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.204 (1228) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.001 (5) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.318 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.100 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.100 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.795 (4777) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.688 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: safe = 7236 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: unsafe = 20 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: total = 7258 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 7260 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 941243982 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5755 bytes +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1772544 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 463 bytes +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.439 seconds +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 968mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Total: 7258 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (7258) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Rover zone: 0.891 (6468) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.066 (476) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.043 (310) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Slice zone: 0.001 (4) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.168 (1222) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.086 (626) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.284 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.293 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.519 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.745 (5410) +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.570 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.438 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:46Z INFO 9505 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.090 seconds +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 968mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11879 memory location(s), 1 block(s), and 54659 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=11879 blocks=1 instructions=54659 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 943016526, 97.4917% input load, 0% output write, 2.50827% spill/reload [sg0000] +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(9.19363e+08) +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload instructions +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload memory locations +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:46Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2048, 0.00865838% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 946343606 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5756 bytes +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2804842 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 603 bytes +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:47Z USER 9505 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 1.104 seconds +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 972mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:47Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:47Z USER 9505 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.138 seconds +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 973mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13307 memory location(s), 1 block(s), and 57365 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:47Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=13307 blocks=1 instructions=57365 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 949148448, 97.1336% input load, 3.37144e-06% output write, 2.86644% spill/reload [sg0000] +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 5759 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 479 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 941242958 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5759 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1771520 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 479 bytes +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(9.21942e+08) +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 2048, 0.000217175% out of total dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 943014478, 97.4919% input load, 0% output write, 2.50806% spill/reload [sg0000] +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 941242958 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5759 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1771520 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 479 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16416 +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 7 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 5564 bytes +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:47Z USER 9505 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.003 seconds +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 974mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54657 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:47Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=11875 blocks=1 instructions=54657 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 6 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 171 Sb address +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 6208, 0.0228178% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 886 Sb address +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:47Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 0 SpillSaves and Reloads +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 5763 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 630 bytes +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 180 Sb address +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 946340502 +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5763 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2801738 +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 630 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 6208, 0.00065406% out of total dma traffic +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 949142240, 97.1342% input load, 3.37147e-06% output write, 2.86581% spill/reload [sg0000] +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 946340502 +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5763 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2801738 +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 630 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 482400 +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 85 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 5447 bytes +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:48Z USER 9505 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.035 seconds +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 976mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57359 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:48Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13299 blocks=1 instructions=57359 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 64 Sb address +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 187 Sb address +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1140 Sb address +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 910 Sb address +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.002 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 977mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54657 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=11875 blocks=1 instructions=54657 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: spill space = 67584 bytes +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 69632 bytes +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: size = 2 +2025-11-04T21:40:48Z INFO 9505 []: find first defs for local +2025-11-04T21:40:48Z INFO 9505 []: find first defs for global +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: Num intervals 2 Num locations 2 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: lo = 2 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: total = 2 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 69632 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.191 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 977mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54657 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=11875 blocks=1 instructions=54657 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 67584 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 67584 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.097 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54657 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=11875 blocks=1 instructions=54657 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 1013 out of 4882 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.013 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54657 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=11875 blocks=1 instructions=54657 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.031 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 54687, number of allocs: 11875 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.012171 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.007 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.007 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.007 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.041 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.008 seconds +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:48Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47972_i1}@SB<0,17024>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47981_i1}@SB<0,31560>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47990_i1}@SB<0,17992>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47999_i1}@SB<0,30752>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t48008_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t48017_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t48026_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t48035_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t48044_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t48053_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t48062_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t48071_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t48080_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t48089_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t48098_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t48107_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t48116_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t48125_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t48134_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t48143_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t48152_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t48161_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t48170_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t48179_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t48188_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t48197_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t48206_i1}@SB<0,30152>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t48215_i1}@SB<0,27848>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45317_i1}@SB<96,19208>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:48Z WARNING 9505 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45322_i1}@SB<96,17416>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:48Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 266 Sb address +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.117 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.010 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 978mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [build_flow_deps]: Allocs: 11875 instructions: 54687 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 142704 edges +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [build_flow_deps]: Done build fdeps 142704 Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.187 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 979mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.038 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 979mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:49Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 38 Sb address +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.437 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1050mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.124 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=11875 blocks=1 instructions=54687 Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:49Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1241 Sb address +2025-11-04T21:40:49Z USER 9505 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.058 seconds +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 11875 memory location(s), 1 block(s), and 54687 instruction(s). Max writers: 298 Max Readers: 8525 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.749 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57359 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=13299 blocks=1 instructions=57359 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: spill space = 98336 bytes +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 102400 bytes +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: size = 6 +2025-11-04T21:40:50Z INFO 9505 []: find first defs for local +2025-11-04T21:40:50Z INFO 9505 []: find first defs for global +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Num intervals 6 Num locations 6 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: lo = 6 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: total = 6 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 73728 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.229 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57359 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=13299 blocks=1 instructions=57359 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 73728 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 73728 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.113 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57359 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=13299 blocks=1 instructions=57359 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 1125 out of 5127 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.019 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57359 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=13299 blocks=1 instructions=57359 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.037 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 57389, number of allocs: 13299 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.006815 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.008 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.008 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.008 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1051mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.068 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1052mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.012 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1052mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.142 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1052mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.011 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1052mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [build_flow_deps]: Allocs: 13299 instructions: 57389 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 156935 edges +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [build_flow_deps]: Done build fdeps 156935 Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.180 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1064mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.042 seconds +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1064mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:50Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.435 seconds +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.139 seconds +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=13299 blocks=1 instructions=57389 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.062 seconds +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13299 memory location(s), 1 block(s), and 57389 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:51Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 7.329 seconds +2025-11-04T21:40:51Z INFO 9505 [BackendPassManager]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:51Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25174 blocks=2 instructions=112076 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25174 blocks=2 instructions=112076 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.006 seconds +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25174 memory location(s), 2 block(s), and 112076 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=25174 blocks=2 instructions=112076 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.061 seconds +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 112990 instruction(s). Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=25572 blocks=2 instructions=112990 Max writers: 298 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.227 seconds +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 112994 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:51Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.309 seconds +2025-11-04T21:40:51Z INFO 9505 [BackendPassManager]: curr_vmrss: 1109mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:51Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:51Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:51Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: reserved space = 233728 bytes +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:40:51Z USER 9505 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.057 seconds +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: reserved space = 264488 bytes +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:51Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: size = 132 +2025-11-04T21:40:51Z INFO 9505 []: find first defs for local +2025-11-04T21:40:52Z INFO 9505 []: find first defs for global +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Num intervals 132 Num locations 132 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: lo = 132 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: total = 132 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 73728 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 73728 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3842048 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3842048 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6295552 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.203 seconds +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.208 seconds +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:40:52Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.003 seconds +2025-11-04T21:40:52Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 112994 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.008 seconds +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:40:52Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.044 seconds +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.057 seconds +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.061 seconds +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: curr_vmrss: 1110mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:40:52Z USER 9505 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:40:52Z INFO 9505 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z INFO 9505 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.252 seconds +2025-11-04T21:40:52Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z USER 9505 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.261 seconds +2025-11-04T21:40:52Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: nc_parallel_pass finished after 0.268 seconds +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:52Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112994 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:40:52Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z USER 9505 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:40:52Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:52Z INFO 9505 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:40:52Z INFO 9505 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:40:52Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:52Z INFO 9505 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:40:52Z INFO 9505 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:40:52Z INFO 9505 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:40:52 2025 +2025-11-04T21:40:52Z INFO 9505 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:40:52 2025 +2025-11-04T21:40:52Z INFO 9505 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:40:53Z INFO 9505 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:40:54Z INFO 9505 [post_scheduler]: Time-aware simulation time: 5537105 +2025-11-04T21:40:55Z INFO 9505 [post_scheduler]: Time-aware simulation time: 5162965 +2025-11-04T21:40:55Z INFO 9505 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: post_sched finished after 2.820 seconds +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1231mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.013 seconds +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1111mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.059 seconds +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: post_sched finished after 2.917 seconds +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.009 seconds +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55146 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=12074 blocks=1 instructions=55146 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.059 seconds +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:55Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:55Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 2.995 seconds +2025-11-04T21:40:55Z INFO 9505 [BackendPassManager]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:55Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112962 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:55Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=25572 blocks=2 instructions=112962 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.006 seconds +2025-11-04T21:40:55Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 112962 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:55Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.011 seconds +2025-11-04T21:40:55Z INFO 9505 [BackendPassManager]: curr_vmrss: 1112mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:55Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:55Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112962 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:55Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:55Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:55Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 3151 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2518 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 269 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 279 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2497 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 2042 PSUM Banks +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 15 Sb address +2025-11-04T21:40:56Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 22 Sb address +2025-11-04T21:40:56Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 25 Sb address +2025-11-04T21:40:56Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 91 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 23 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 46 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 111 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 970 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 725 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: moved 3 MM forward +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: moved 3 MM forward +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:57Z USER 9505 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.229 seconds +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1116mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:57Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:57Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:40:57Z USER 9505 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.253 seconds +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1116mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:57Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:57Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.761 seconds +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1264mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.797 seconds +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1131mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.058 seconds +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1131mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [build_flow_deps]: Allocs: 13498 instructions: 57848 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.075 seconds +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1132mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [build_flow_deps]: Allocs: 12074 instructions: 55114 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 160555 edges +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [build_flow_deps]: Done build fdeps 160555 Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 145432 edges +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [build_flow_deps]: Done build fdeps 145432 Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.281 seconds +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 208 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 11 │ 2743560 │ +│ Load │ ExternalInput -> Internal │ 1282 │ 919197792 │ +│ Load │ Internal │ 77 │ 4451366 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 378 │ 2801706 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 72 │ +│ 8 │ 4 │ +│ 16 │ 4 │ +│ 32 │ 59 │ +│ 64 │ 7 │ +│ 88 │ 3 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 307 │ +│ 1024 │ 17 │ +│ 2048 │ 197 │ +│ 4096 │ 325 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 532 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 44726 #MatMult-Transposes 9321 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5789968576 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60705_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t80149 │ Internal │ float32 │ 2562048 │ +│ -t80143 │ Internal │ float32 │ 2562048 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ -t80146 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ all_gather.1_nostride_60811_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.023 seconds +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57848 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.282 seconds +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 177 │ 131072 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 1281 │ 919197760 │ +│ Load │ Internal │ 67 │ 1932294 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 335 │ 1771520 │ +└──────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 66 │ +│ 8 │ 3 │ +│ 16 │ 4 │ +│ 32 │ 60 │ +│ 64 │ 3 │ +│ 88 │ 3 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 305 │ +│ 1024 │ 2 │ +│ 2048 │ 197 │ +│ 4096 │ 297 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 532 │ +│ 16400 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 43882 #MatMult-Transposes 8537 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5789968576 +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60705_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_60811_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60811_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.024 seconds +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55114 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:58Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 3.445 seconds +2025-11-04T21:40:58Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z USER 9505 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:40:58Z INFO 9505 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=25572 blocks=2 instructions=112962 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 449 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 405 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:40:58Z INFO 9505 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:40:58Z USER 9505 [BackendPassManager]: assign_trigger_engine finished after 0.074 seconds +2025-11-04T21:40:58Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 112962 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:58Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=112962 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:40:58Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=12074 blocks=1 instructions=55114 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=13498 blocks=1 instructions=57848 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:58Z USER 9505 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.020 seconds +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.030 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.035 seconds +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=25572 blocks=2 instructions=113080 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: assign_hwdge_engine finished after 0.020 seconds +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 113080 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=113080 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 8 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 226 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 245 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 82 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 23 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 1287 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 4 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.011 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 17 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 259 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 266 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 98 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 31 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 1512 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 6 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.012 seconds +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.017 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.018 seconds +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.040 seconds +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=113080 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:40:59Z USER 9505 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:40:59Z INFO 9505 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z USER 9505 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: nc_parallel_pass finished after 0.003 seconds +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=113080 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:40:59Z USER 9505 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.084 seconds +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z USER 9505 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.102 seconds +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.108 seconds +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:40:59Z USER 9505 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:40:59Z INFO 9505 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=113080 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z USER 9505 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:40:59Z USER 9505 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:40:59Z INFO 9505 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:40:59Z INFO 9505 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 47402 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 53934 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 50166 +2025-11-04T21:40:59Z INFO 9505 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 50166 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 57604 +2025-11-04T21:40:59Z INFO 9505 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 57604 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [DepReduction]: Finished dependency reduction: 327828 removed, new total 13983 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: dep_reduction finished after 0.858 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1213mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.027 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1167mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.038 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1170mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.010 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1171mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55173 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=12074 blocks=1 instructions=55173 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1164/1164 (100% DGE) + power-of-2 partition : 1164/1169 (99.5723% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1165/1170 (99.5726% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 27/27 (100% DGE) + power-of-2 partition : 27/419 (6.44391% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 27/419 (6.44391% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 9/9 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_dma finished after 0.058 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1175mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55174 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=12074 blocks=1 instructions=55174 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: expand_all_engine finished after 0.018 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1178mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55174 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=12074 blocks=1 instructions=55174 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.098 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1197mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55174 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=12074 blocks=1 instructions=55174 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [DepReduction]: Finished dependency reduction: 403644 removed, new total 15691 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: dep_reduction finished after 1.207 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1198mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.014 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1187mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: expand_inst_late finished after 0.113 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55183 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=12074 blocks=1 instructions=55183 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.010 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 55176 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=12074 blocks=1 instructions=55176 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.038 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.009 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57907 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=13498 blocks=1 instructions=57907 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_sync finished after 0.032 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57273 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=12074 blocks=1 instructions=57273 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_act finished after 0.012 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1186mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=12074 blocks=1 instructions=57414 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1164/1164 (100% DGE) + power-of-2 partition : 1164/1171 (99.4022% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1165/1172 (99.4027% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 37/37 (100% DGE) + power-of-2 partition : 37/478 (7.74059% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 37/478 (7.74059% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 197 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: lower_dma finished after 0.061 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1188mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57909 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=13498 blocks=1 instructions=57909 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: expand_all_engine finished after 0.018 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1190mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57909 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=13498 blocks=1 instructions=57909 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.103 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1203mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57909 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=13498 blocks=1 instructions=57909 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_dve finished after 0.326 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=12074 blocks=1 instructions=57414 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: expand_inst_late finished after 0.197 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1189mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 58143 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: lower_ap finished after 0.030 seconds +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1189mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z USER 9505 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=13498 blocks=1 instructions=58143 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z INFO 9505 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=12074 blocks=1 instructions=57414 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.020 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1190mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 57913 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=13498 blocks=1 instructions=57913 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: lower_sync finished after 0.037 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1190mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60521 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:41:00Z INFO 9505 (nc01/sg00) [REG_Allocator]: size = 2 +2025-11-04T21:41:00Z INFO 9505 []: find first defs for local reg +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=13498 blocks=1 instructions=60521 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: lower_act finished after 0.013 seconds +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1191mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z USER 9505 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:00Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=13498 blocks=1 instructions=60663 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:00Z INFO 9505 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:00Z INFO 9505 []: find first defs for global reg +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: lo = 2 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: total = 2 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:01Z USER 9505 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.245 seconds +2025-11-04T21:41:01Z INFO 9505 (nc01) [CoreForkPass]: curr_vmrss: 1223mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:01Z USER 9505 (nc00) [CoreForkPass]: lower_dve finished after 0.241 seconds +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1212mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=13498 blocks=1 instructions=60663 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc00) [CoreForkPass]: lower_ap finished after 0.014 seconds +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1182mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=13498 blocks=1 instructions=60663 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:41:01Z INFO 9505 []: find first defs for local reg +2025-11-04T21:41:01Z INFO 9505 []: find first defs for global reg +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:01Z USER 9505 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.184 seconds +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: curr_vmrss: 1188mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: nc_parallel_pass finished after 2.237 seconds +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: curr_vmrss: 1171mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: vnc_remote_addr_map finished after 0.006 seconds +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: curr_vmrss: 1143mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 118077 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: Running vnc_link +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z INFO 9505 [VncLink]: Found 0 remote updates +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: vnc_link finished after 0.002 seconds +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: curr_vmrss: 1143mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 118077 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:01Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=13498 blocks=1 instructions=60663 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=12074 blocks=1 instructions=57414 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:01Z USER 9505 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.158 seconds +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.166 seconds +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:01Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.170 seconds +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: curr_vmrss: 1150mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:41:01Z INFO 9505 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.025 seconds +2025-11-04T21:41:01Z INFO 9505 (sg00) [SubgraphForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9505 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 118077 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: subgraph_parallel_pass finished after 0.031 seconds +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: curr_vmrss: 1150mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:01Z USER 9505 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:01Z INFO 9505 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z USER 9505 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:01Z USER 9505 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=13498 blocks=1 instructions=60663 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=12074 blocks=1 instructions=57414 Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:01Z INFO 9505 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64233 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:01Z INFO 9505 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64233 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 44907 │ +│ LDWEIGHTS │ 44907 │ +│ CAST │ 3566 │ +│ EVENT_SEMAPHORE │ 2097 │ +│ ACTIVATE │ 2092 │ +│ UNKNOWN(0xd4) │ 1314 │ +│ TENSOR_TENSOR │ 1217 │ +│ COPY │ 1119 │ +│ UNKNOWN(0xd8) │ 589 │ +│ PSEUDO_DMA_TRIGGER │ 567 │ +│ TENSOR_SCALAR │ 258 │ +│ MEMSET │ 227 │ +│ ACT_TABLE_LOAD │ 141 │ +│ TENSOR_SCALAR_ADDR │ 113 │ +│ UNKNOWN(0xda) │ 68 │ +│ UNKNOWN(0xd9) │ 59 │ +│ TENSOR_REDUCE │ 58 │ +│ RECIPROCAL │ 57 │ +│ UNKNOWN(0xe8) │ 30 │ +│ STREAM_SHUFFLE │ 24 │ +│ LOAD_MASK_SELECT │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 2 │ +│ IOTA │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 3436 │ +│ Scalar │ 6696 │ +│ Tensor │ 90401 │ +│ SyncDMA │ 0 │ +│ Vector │ 2867 │ +│ Sync │ 47 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:41:02Z USER 9505 (nc01/sg00) [Codegen]: isa_gen finished after 0.423 seconds +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4032 │ +│ qDVESpillReload0 │ 1312 │ +│ qPoolSpillReload0 │ 44805 │ +│ qSPIO0 │ 33 │ +│ qSPSpillReload0 │ 196 │ +└───────────────────┴────────────────┘ + +Total descriptors: 50378 (0.000750691 GB) +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────┼───────────────┼──────────┼──────────────────┤ +│ 38797.56597_i557 │ Internal │ float32 │ 1 │ +│ _dot.4655-t48081_i1 │ Internal │ bfloat16 │ 1 │ +│ dot.32-buffer-79829 │ Internal │ bfloat16 │ 1 │ +│ 38797.56597_i558 │ Internal │ float32 │ 1 │ +│ 38797.56597_i361 │ Internal │ float32 │ 1 │ +│ 38797.56597_i463 │ Internal │ float32 │ 1 │ +│ 38810.59681_i1 │ Internal │ float32 │ 1 │ +│ split_1 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:02Z USER 9505 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.011 seconds +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:02Z WARNING 9505 (nc01/sg00) [Codegen]: Found 93 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:02Z USER 9505 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.119 seconds +2025-11-04T21:41:02Z USER 9505 (nc01/sg00) [ModuleForkPass]: codegen finished after 0.571 seconds +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1247mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 12074 memory location(s), 1 block(s), and 57414 instruction(s). Max writers: 299 Max Readers: 8525 +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 45811 │ +│ LDWEIGHTS │ 45811 │ +│ CAST │ 3566 │ +│ EVENT_SEMAPHORE │ 2608 │ +│ ACTIVATE │ 2099 │ +│ UNKNOWN(0xd4) │ 1549 │ +│ COPY │ 1353 │ +│ TENSOR_TENSOR │ 1218 │ +│ TENSOR_SCALAR_ADDR │ 674 │ +│ PSEUDO_DMA_TRIGGER │ 648 │ +│ UNKNOWN(0xd8) │ 589 │ +│ IOTA │ 394 │ +│ UNKNOWN(0xda) │ 293 │ +│ TENSOR_SCALAR │ 260 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ GATHER │ 240 │ +│ MEMSET │ 239 │ +│ ACT_TABLE_LOAD │ 142 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_REDUCE │ 63 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ UNKNOWN(0xe8) │ 30 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: +┌────────────┬───────┐ +│ Engine │ Count │ +├────────────┼───────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 5696 │ +│ Scalar │ 6831 │ +│ Tensor │ 92262 │ +│ SyncDMA │ 0 │ +│ Vector │ 3662 │ +│ Sync │ 75 │ +│ All │ 0 │ +└────────────┴───────┘ + +2025-11-04T21:41:02Z USER 9505 (nc00/sg00) [Codegen]: isa_gen finished after 0.643 seconds +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4228 │ +│ qDVESpillReload0 │ 1540 │ +│ qPoolSpillReload0 │ 55623 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 193 │ +└───────────────────┴────────────────┘ + +Total descriptors: 61635 (0.000918433 GB) +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────┼───────────────┼──────────┼──────────────────┤ +│ get_tuple_element.6 │ Internal │ float32 │ 2 │ +│ input2 │ ExternalInput │ int32 │ 2 │ +│ rng.1 │ Internal │ float32 │ 2 │ +│ get_tuple_element.3 │ Internal │ float32 │ 2 │ +│ get_tuple_element.1 │ Internal │ float32 │ 2 │ +│ get_tuple_element.2 │ Internal │ uint32 │ 2 │ +│ all_gather.2 │ Internal │ float32 │ 2 │ +│ all_reduce.112 │ Internal │ bfloat16 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:02Z USER 9505 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.016 seconds +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:02Z WARNING 9505 (nc00/sg00) [Codegen]: Found 204 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:02Z USER 9505 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.149 seconds +2025-11-04T21:41:02Z USER 9505 (nc00/sg00) [ModuleForkPass]: codegen finished after 0.835 seconds +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1208mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 13498 memory location(s), 1 block(s), and 60663 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:02Z USER 9505 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:02Z USER 9505 [BackendPassManager]: mod_parallel_pass finished after 0.853 seconds +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:02Z USER 9505 [BackendPassManager]: Running hbm_usage +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 122.094KB │ +│ CCE │ 0.000B │ 674.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 160.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.658GB │ +│ Model Code │ 6.624MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.004MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 956.766KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 416.000B │ 114.594KB │ +│ CCE │ 0.000B │ 506.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.500KB │ 140.250KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:02Z INFO 9505 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.658GB │ +│ Model Code │ 6.314MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.004MB │ +│ DMA Ring IO │ 1.906KB │ +│ DMA Ring Spill │ 761.516KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:02Z INFO 9505 [HBMUsage]: Total estimated HBM usage is: 3.671GB +2025-11-04T21:41:02Z USER 9505 [BackendPassManager]: hbm_usage finished after 0.007 seconds +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: curr_vmrss: 1134mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 118077 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:02Z USER 9505 [BackendPassManager]: Running neff_packager +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=25572 blocks=2 instructions=118077 Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:02Z WARNING 9505 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg/metrics.json +2025-11-04T21:41:02Z WARNING 9505 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:41:02Z INFO 9505 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff +2025-11-04T21:41:02Z INFO 9505 [NeffFileWriter]: IR signature: 1f1e27d2de586461125789865a801f67 for neff artifacts +2025-11-04T21:41:02Z USER 9505 [BackendPassManager]: neff_packager finished after 0.443 seconds +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: curr_vmrss: 1135mb, ru_maxrss: 1278mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9505 [BackendPassManager]: Output has 2 module(s), 2 function(s), 25572 memory location(s), 2 block(s), and 118077 instruction(s). Max writers: 299 Max Readers: 9309 +2025-11-04T21:41:02Z INFO 9505 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000069 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005863 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000095 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005882 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000069 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005863 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000065 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000065 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000065 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005863 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:41:02Z INFO 9505 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_3 │ float32 │ 1 │ 0.062500 MB │ +│ -t80409 │ float32 │ 1 │ 0.007812 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:02Z INFO 9505 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:02Z INFO 9505 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_2 │ float32 │ 1 │ 0.062500 MB │ +│ split_1 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:02Z INFO 9505 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:41:03Z INFO 8853 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:41:03Z INFO 8853 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:41:03Z INFO 8853 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg +2025-11-04T21:41:03Z INFO 8853 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:41:03Z INFO 8853 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg/hlo_netlist.json +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk1/neuronxcc-epchk4hg/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:41:03Z INFO 8853 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:41:03Z INFO 8853 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:41:03Z INFO 8792 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk1/metaneff.pb b/token_generation_model/_tp0_bk1/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..0823477be67bb55c7b633504cb84841fb4e330db --- /dev/null +++ b/token_generation_model/_tp0_bk1/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9673a00cffa717f111c4bf44d2667c85a8d5140b1086eaab2d7068479950ee08 +size 3988817 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb b/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..65549c978b65a74558d6c7eeba20a1b4c66d4909 --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1edd084900bc72d9299ca470a98a68f62a5de01d8951e905f7956ff1d143ad0f +size 4075105 diff --git a/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff b/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff new file mode 100644 index 0000000000000000000000000000000000000000..96245a45162e26d4f994ef20644d1d1eea4b346f --- /dev/null +++ b/token_generation_model/_tp0_bk1/model.MODULE_122f32d499d16ac150a0+bdebe6e1.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73cd1b99047c46f005f119317e5cec46b3ade159908bfc6f17487c1550536355 +size 6093824 diff --git a/token_generation_model/_tp0_bk1/neuron_config.json b/token_generation_model/_tp0_bk1/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11356d824f7d314097f63212bcbf44e5e271c003 --- /dev/null +++ b/token_generation_model/_tp0_bk1/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 256 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 256 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk2/command.txt b/token_generation_model/_tp0_bk2/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2942ad589ac7a9dc88398c695216bfbf00c2870 --- /dev/null +++ b/token_generation_model/_tp0_bk2/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb --output model.MODULE_bac42b9b464c64624582+1ea12800.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/compile_flags.MODULE_bac42b9b464c64624582+1ea12800.json b/token_generation_model/_tp0_bk2/compile_flags.MODULE_bac42b9b464c64624582+1ea12800.json new file mode 100644 index 0000000000000000000000000000000000000000..76fbb1f68483f7b4dd48043817e127738144a5c3 --- /dev/null +++ b/token_generation_model/_tp0_bk2/compile_flags.MODULE_bac42b9b464c64624582+1ea12800.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/global_metric_store.json b/token_generation_model/_tp0_bk2/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..cb84a99c31f697b08f018053ff4192e5dc271abc --- /dev/null +++ b/token_generation_model/_tp0_bk2/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.0271987915039, + "StaticProfiler::AveragePartitionUtilization": 91.27035522460938, + "StaticProfiler::AveragePeUtilization": 79.1353759765625, + "StaticProfiler::LocalizationEfficiency": 286.7578125, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 294.8894348144531, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.6242458820343018, + "AffinePredicateResolution": 0.059035539627075195, + "AliasDependencyElimination": 0.003847837448120117, + "AliasDependencyInduction": 0.8357675075531006, + "AliasDependencyReset": 0.8834974765777588, + "BFComputeCutting": 0.06842684745788574, + "BirCodeGenLoop": 1.8930256366729736, + "CCOpFusion": 0.7189147472381592, + "CanonicalizeConv": 0.0, + "CanonicalizeDAGForPGTiling": 0.1776437759399414, + "CanonicalizeForTensorizer": 0.0003809999907389283, + "CanonicalizeIR": 0.09505891799926758, + "Canonicalizer": 0.010010000318288803, + "CoalesceCCOp": 0.17792296409606934, + "CommuteConcat": 0.03131699562072754, + "DMALocalityOpt": 0.04498600959777832, + "DMAProfiler": 0.08704590797424316, + "DMATilingProfiler": 0.09701228141784668, + "DataLocalityOpt": 3.2508599758148193, + "DataStreaming": 0.17421460151672363, + "DeConcat": 0.06317973136901855, + "DeadCodeElimination": 0.033698320388793945, + "DeadStoreElimination": 1.1673948764801025, + "DelinearIndices": 0.44278812408447266, + "Delinearization": 0.1639111042022705, + "DelinearizeSPMD": 0.20702695846557617, + "DoNothing": 0.0003478527069091797, + "DramToDramTranspose": 0.33370137214660645, + "DumpGraphAndMetadata": 0.14346790313720703, + "EliminateDivs": 0.22263097763061523, + "ExpandBatchNorm": 0.09778141975402832, + "ExpandISAMacro": 0.09050297737121582, + "FactorizeBlkDims": 0.5268492698669434, + "FactorizeThreadAxesInFreeDims": 0.09685373306274414, + "FlattenMacroLoop": 0.10535311698913574, + "GenericAccessSimplifier": 0.029217004776000977, + "HoistCompute": 5.6000000768108293e-05, + "IdentifyCrossPassTensors": 0.0001049999991664663, + "InferInitValue": 1.3810255527496338, + "InferIntrinsicOnCC": 0.34633374214172363, + "InferNeuronTensor": 1.7193646430969238, + "InferNonlocalTensors": 5.612908363342285, + "InferPSumTensor": 1.3980116844177246, + "InferShardAxis": 9.386495590209961, + "InferSharedMemLoc": 0.12404179573059082, + "InlineNativeKernels": 0.05704975128173828, + "InsertCoreBarrier": 0.13544154167175293, + "InsertIOTransposes": 0.905651330947876, + "InsertImplicitShardAxisBeforeISel": 0.4292595386505127, + "InsertLocalTransposes": 0.7548105716705322, + "InsertOffloadedTransposes": 0.1284787654876709, + "LICM": 0.12886571884155273, + "LateLegalizeInst": 0.15713143348693848, + "LateLegalizePostSplit": 0.10522270202636719, + "LateLowerReshapeOp": 0.038214921951293945, + "LateLowerTensorOp": 0.42414045333862305, + "LateNeuronInstComb": 1.1582069396972656, + "LayoutPreprocessing": 0.9516875743865967, + "LayoutPreprocessingAndAnalysis": 1.4187979698181152, + "LayoutRequirementAnalysis": 0.45722126960754395, + "LegalizeCCOpLayout": 0.1329505443572998, + "LegalizeOpLevelAlias": 0.04250288009643555, + "LegalizePartitionReduce": 0.09100055694580078, + "LegalizeSundaAccess": 1.0071675777435303, + "LegalizeSundaMacro": 0.7127807140350342, + "LegalizeType": 0.14365863800048828, + "LocalLayoutOpt": 0.7256138324737549, + "LoopFusion": 0.3850085735321045, + "LoopSplitting": 0.030759811401367188, + "LowerBroadcast": 0.10816812515258789, + "LowerCCOpBlockAxis": 0.2317214012145996, + "LowerComplexBroadcast": 0.07975459098815918, + "LowerIntrinsics": 1.3126790523529053, + "LowerShardAxis": 0.2383556365966797, + "LowerTensorOp": 0.7480523586273193, + "LowerToSendRecv": 0.1662580966949463, + "LowerTranspose": 0.5189094543457031, + "MacroGeneration": 2.659961223602295, + "MaskPropagation": 0.11018657684326172, + "MemcastMotion": 0.00012099999730708078, + "MemcpyElimination": 9.414599418640137, + "MutateDataType": 0.06073474884033203, + "NeuronAliasDependencyInduction": 0.020488739013671875, + "NeuronAliasDependencyReset": 0.037583112716674805, + "NeuronInstComb": 0.4113731384277344, + "NeuronLICM": 0.289081335067749, + "NeuronLoopFusion": 1.6037812232971191, + "NeuronLoopInterchange": 0.0655665397644043, + "NeuronSimplifier": 0.5106499195098877, + "NeuronSimplifyPredicates": 0.2902688980102539, + "NeuronValueNumbering": 0.13138985633850098, + "OptimizeAliasedCopyChain": 0.028412580490112305, + "OptimizeNKIKernels": 1.5485177040100098, + "PAGLayoutOpt": 15.63675308227539, + "PComputeCutting": 0.3207705020904541, + "PGLayoutTilingPipeline": 41.22283172607422, + "PGTiling": 6.012831687927246, + "PadElimination": 0.017678499221801758, + "ParAxesAnnotation": 14.871472358703613, + "PartialLoopFusion": 1.256809949874878, + "PartialSimdFusion": 0.8388330936431885, + "PenguinizeFunctions": 0.00020700000459328294, + "PerfectLoopNest": 0.07282257080078125, + "PruneFunctions": 8.600000001024455e-05, + "RecognizeOpIdiom": 0.1260819435119629, + "Recompute": 0.009610652923583984, + "RelaxPredicates": 0.11840128898620605, + "Rematerialization": 0.16344213485717773, + "RemoveOptimizationBarriers": 7.100000220816582e-05, + "RemoveShardedPartitionAxes": 1.2759647369384766, + "ReshapeWeights": 0.03425192832946777, + "ResolveAccessConflict": 0.20044612884521484, + "ResolveComplicatePredicates": 0.07434439659118652, + "RewriteReplicationMatmul": 0.04415702819824219, + "RewriteWeights": 0.0949866771697998, + "SFKVectorizer": 6.701597213745117, + "ScatterMotion": 0.003252000082284212, + "ShardingPropagationAnalysis": 0.7332839965820313, + "SimpleAllReduceTiling": 0.07120895385742188, + "Simplifier": 0.1070256233215332, + "SimplifyMacroPredicates": 0.28906846046447754, + "SimplifyNeuronTensor": 0.4082615375518799, + "SimplifySlice": 0.03178524971008301, + "SimplifyTensor": 0.30094051361083984, + "SpillPSum": 0.7097611427307129, + "SplitAPUnionSets": 0.518521785736084, + "SplitAccGrp": 0.05238986015319824, + "StaticProfiler": 0.13858842849731445, + "StaticTransposeLocalTensor": 0.26463842391967773, + "SundaISel": 1.8421251773834229, + "TCTransform": 0.032810211181640625, + "TensorInitialization": 0.18349289894104004, + "TensorOpSimplifier": 0.8368067741394043, + "TensorOpTransform": 2.44799542427063, + "TensorizerLegalizationPass": 0.00019099999917671084, + "TileCCOps": 0.2656381130218506, + "TilingProfiler": 0.47725772857666016, + "TransformConvOp": 0.14513254165649414, + "TritiumFusion": 1.311755895614624, + "ValueNumbering": 0.10032439231872559, + "VectorizeDMA": 0.682180643081665, + "VectorizeMatMult": 0.07973051071166992, + "VerifySupportedOps": 0.0004149999876972288, + "WeightCoalescing": 0.064117431640625, + "ZeroSizeTensorElimination": 0.0003650188446044922, + "algsimp": 0.002326000016182661, + "batchnorm_expander": 0.0011060000397264957, + "boundary-marker-removal": 0.0004440000047907233, + "call-inliner": 0.0003169999981764704, + "canonicalize-boundary-marker": 0.0005729999975301325, + "collective-stream-id-checker": 0.00010800000018207356, + "comparison-expander": 0.0005629999795928597, + "computation-deduplicator": 0.0005300000193528831, + "config-lowering": 0.0005389999714680016, + "constant_folding": 0.0002809999859891832, + "cse": 0.0007670000195503235, + "dce": 4.5000000682193786e-05, + "dynamic-slice-transpose": 0.00026699999580159783, + "eliminate-redundant-compare": 0.0001900000061141327, + "emit-offloaded-dropout": 0.0004149999876972288, + "flatten-call-graph": 0.00044800000614486635, + "fuse-send-recv": 0.0022950000129640102, + "hilo-conditional-to-select": 0.0001289999927394092, + "hilo::LegalizeAlias": 0.0027550000231713057, + "hilo::NeuronInstCombine": 0.000846999988425523, + "hilo::NeuronOpFusion": 0.0002229999954579398, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00023799999326001853, + "hilo::ScheduleFusion": 3.300000025774352e-05, + "hilo::SixtyFourHack": 0.00027200000477023423, + "hilo::VerifyAliasing": 9.699999645818025e-05, + "hlo-mac-count": 0.005419999826699495, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001449999981559813, + "legalize-ccops-for-tensorizer": 2.499999936844688e-05, + "legalize-compare": 0.0006130000110715628, + "lower-argminmax-custom-call": 0.0002730000123847276, + "map-inline": 0.0008970000199042261, + "metadata-naming": 0.0014349999837577343, + "mlir::detail::OpToOpPassAdaptor": 0.000195999993593432, + "mlir::hlo::MhloToPyPenguin": 0.0739699974656105, + "mlir::mhlo::LowerComplexExtraPass": 0.0030060000717639923, + "mlir::mhlo::LowerComplexPass": 0.0021909999195486307, + "native-to-custom-softmax": 0.0005119999987073243, + "native-to-custom-softmax-dx": 0.0006050000083632767, + "neuron-hlo-verifier": 0.02477400004863739, + "operand_upcaster": 0.0010209999745711684, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06238299980759621, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003100000030826777, + "reshape-mover": 0.00010099999781232327, + "simplify-concat": 0.002369999885559082, + "simplify-while-loops": 8.299999899463728e-05, + "transform-variadic-reduce": 0.0007149999728426337, + "tuple-simplifier": 0.00020399999630171806, + "unpack-nested-aws-ntwsr": 0.000539999979082495, + "unroll-while-loop": 1.1000000085914508e-05 + }, + "hilo": { + "HloMacCount": 7133691904.0, + "Traffic": 3915379456.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 99812, + "StaticProfiler::AifUb": 12.285852432250977, + "StaticProfiler::ArithmeticIntensityTensorizer": 35.23064041137695, + "StaticProfiler::AverageDmaLength": 3003.041015625, + "StaticProfiler::DDRTransferBytes": 2019175380, + "StaticProfiler::InternalTransferBytes": 456879296, + "StaticProfiler::LoadExpanded": 535846, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 544195, + "StaticProfiler::TotalDynamicInstancesCount": 125802, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 104329, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 71329, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 20017, + "TilingProfiler::PfTransposeInstructionsForIo": 16676, + "TilingProfiler::PfTransposeInstructionsForLocal": 1379, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 173, + "TilingProfiler::SimdInstructionsAfterTiling": 2796, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 0.0003809999907389283, + "Canonicalizer": 0.010010000318288803, + "HoistCompute": 5.6000000768108293e-05, + "IdentifyCrossPassTensors": 0.0001049999991664663, + "MemcastMotion": 0.00012099999730708078, + "PenguinizeFunctions": 0.00020700000459328294, + "PruneFunctions": 8.600000001024455e-05, + "RemoveOptimizationBarriers": 7.100000220816582e-05, + "ScatterMotion": 0.003252000082284212, + "TensorizerLegalizationPass": 0.00019099999917671084, + "VerifySupportedOps": 0.0004149999876972288, + "algsimp": 0.002326000016182661, + "batchnorm_expander": 0.0011060000397264957, + "boundary-marker-removal": 0.0004440000047907233, + "call-inliner": 0.0003169999981764704, + "canonicalize-boundary-marker": 0.0005729999975301325, + "collective-stream-id-checker": 0.00010800000018207356, + "comparison-expander": 0.0005629999795928597, + "computation-deduplicator": 0.0005300000193528831, + "config-lowering": 0.0005389999714680016, + "constant_folding": 0.0002809999859891832, + "cse": 0.0007670000195503235, + "dce": 4.5000000682193786e-05, + "dynamic-slice-transpose": 0.00026699999580159783, + "eliminate-redundant-compare": 0.0001900000061141327, + "emit-offloaded-dropout": 0.0004149999876972288, + "flatten-call-graph": 0.00044800000614486635, + "fuse-send-recv": 0.0022950000129640102, + "hilo-conditional-to-select": 0.0001289999927394092, + "hilo::LegalizeAlias": 0.0027550000231713057, + "hilo::NeuronInstCombine": 0.000846999988425523, + "hilo::NeuronOpFusion": 0.0002229999954579398, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00023799999326001853, + "hilo::ScheduleFusion": 3.300000025774352e-05, + "hilo::SixtyFourHack": 0.00027200000477023423, + "hilo::VerifyAliasing": 9.699999645818025e-05, + "hlo-mac-count": 0.005419999826699495, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.001449999981559813, + "legalize-ccops-for-tensorizer": 2.499999936844688e-05, + "legalize-compare": 0.0006130000110715628, + "lower-argminmax-custom-call": 0.0002730000123847276, + "map-inline": 0.0008970000199042261, + "metadata-naming": 0.0014349999837577343, + "mlir::detail::OpToOpPassAdaptor": 0.000195999993593432, + "mlir::hlo::MhloToPyPenguin": 0.0739699974656105, + "mlir::mhlo::LowerComplexExtraPass": 0.0030060000717639923, + "mlir::mhlo::LowerComplexPass": 0.0021909999195486307, + "native-to-custom-softmax": 0.0005119999987073243, + "native-to-custom-softmax-dx": 0.0006050000083632767, + "neuron-hlo-verifier": 0.02477400004863739, + "operand_upcaster": 0.0010209999745711684, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06238299980759621, + "pre-hlo-begin": 4.999999873689376e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003100000030826777, + "reshape-mover": 0.00010099999781232327, + "simplify-concat": 0.002369999885559082, + "simplify-while-loops": 8.299999899463728e-05, + "transform-variadic-reduce": 0.0007149999728426337, + "tuple-simplifier": 0.00020399999630171806, + "unpack-nested-aws-ntwsr": 0.000539999979082495, + "unroll-while-loop": 1.1000000085914508e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002779960632324219, + "DMALocalityOpt": 0.00020742416381835938, + "DMAProfiler": 0.0007307529449462891, + "DataStreaming": 0.0003261566162109375, + "DoNothing": 0.00012183189392089844, + "ExpandISAMacro": 0.0006313323974609375, + "FactorizeBlkDims": 0.0004703998565673828, + "InferPSumTensor": 0.0005621910095214844, + "InferSharedMemLoc": 0.0003247261047363281, + "InsertCoreBarrier": 0.0003006458282470703, + "LateLegalizeInst": 0.0004780292510986328, + "LateNeuronInstComb": 0.0006775856018066406, + "LegalizeSundaAccess": 0.0015130043029785156, + "LegalizeType": 0.00026535987854003906, + "LowerBroadcast": 0.0002560615539550781, + "LowerIntrinsics": 0.00024080276489257813, + "LowerTranspose": 0.00029087066650390625, + "NeuronInstComb": 0.0006630420684814453, + "NeuronLICM": 0.0004336833953857422, + "NeuronSimplifyPredicates": 0.002326488494873047, + "NeuronValueNumbering": 0.0004627704620361328, + "SFKVectorizer": 0.0028204917907714844, + "SimpleAllReduceTiling": 0.00021076202392578125, + "SimplifyNeuronTensor": 0.000614166259765625, + "SpillPSum": 0.000591278076171875, + "WeightCoalescing": 0.00021719932556152344 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 3.6439340114593506, + "HloMacCount": 7133691904.0, + "Traffic": 3915379456.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.6242458820343018, + "AffinePredicateResolution": 0.059035539627075195, + "AliasDependencyElimination": 0.003847837448120117, + "AliasDependencyInduction": 0.8357675075531006, + "AliasDependencyReset": 0.8834974765777588, + "BFComputeCutting": 0.06842684745788574, + "BirCodeGenLoop": 1.8930256366729736, + "CCOpFusion": 0.7189147472381592, + "CanonicalizeDAGForPGTiling": 0.1776437759399414, + "CanonicalizeIR": 0.09505891799926758, + "CoalesceCCOp": 0.17501068115234375, + "CommuteConcat": 0.03131699562072754, + "DMALocalityOpt": 0.04282784461975098, + "DMAProfiler": 0.08357381820678711, + "DMATilingProfiler": 0.09701228141784668, + "DataLocalityOpt": 3.2508599758148193, + "DataStreaming": 0.16964387893676758, + "DeConcat": 0.06317973136901855, + "DeadCodeElimination": 0.033698320388793945, + "DeadStoreElimination": 1.1673948764801025, + "DelinearIndices": 0.44278812408447266, + "Delinearization": 0.1639111042022705, + "DelinearizeSPMD": 0.20702695846557617, + "DoNothing": 6.318092346191406e-05, + "DramToDramTranspose": 0.33370137214660645, + "DumpGraphAndMetadata": 0.14346790313720703, + "EliminateDivs": 0.22263097763061523, + "ExpandBatchNorm": 0.09778141975402832, + "ExpandISAMacro": 0.08714938163757324, + "FactorizeBlkDims": 0.5179133415222168, + "FactorizeThreadAxesInFreeDims": 0.09685373306274414, + "FlattenMacroLoop": 0.10535311698913574, + "GenericAccessSimplifier": 0.029217004776000977, + "InferInitValue": 1.3810255527496338, + "InferIntrinsicOnCC": 0.34633374214172363, + "InferNeuronTensor": 1.7193646430969238, + "InferNonlocalTensors": 5.612908363342285, + "InferPSumTensor": 1.3907256126403809, + "InferShardAxis": 9.386495590209961, + "InferSharedMemLoc": 0.12169718742370605, + "InlineNativeKernels": 0.05704975128173828, + "InsertCoreBarrier": 0.1328897476196289, + "InsertIOTransposes": 0.905651330947876, + "InsertImplicitShardAxisBeforeISel": 0.4292595386505127, + "InsertLocalTransposes": 0.7548105716705322, + "InsertOffloadedTransposes": 0.1284787654876709, + "LICM": 0.12886571884155273, + "LateLegalizeInst": 0.1520097255706787, + "LateLegalizePostSplit": 0.10522270202636719, + "LateLowerReshapeOp": 0.038214921951293945, + "LateLowerTensorOp": 0.42414045333862305, + "LateNeuronInstComb": 1.1515285968780518, + "LayoutPreprocessing": 0.9516875743865967, + "LayoutPreprocessingAndAnalysis": 1.4187979698181152, + "LayoutRequirementAnalysis": 0.45722126960754395, + "LegalizeCCOpLayout": 0.1329505443572998, + "LegalizeOpLevelAlias": 0.04250288009643555, + "LegalizePartitionReduce": 0.09100055694580078, + "LegalizeSundaAccess": 0.9970362186431885, + "LegalizeSundaMacro": 0.7127807140350342, + "LegalizeType": 0.13750338554382324, + "LocalLayoutOpt": 0.7256138324737549, + "LoopFusion": 0.3850085735321045, + "LoopSplitting": 0.030759811401367188, + "LowerBroadcast": 0.10549783706665039, + "LowerCCOpBlockAxis": 0.2317214012145996, + "LowerComplexBroadcast": 0.07975459098815918, + "LowerIntrinsics": 1.3098361492156982, + "LowerShardAxis": 0.2383556365966797, + "LowerTensorOp": 0.7480523586273193, + "LowerToSendRecv": 0.1662580966949463, + "LowerTranspose": 0.5161962509155273, + "MacroGeneration": 2.659961223602295, + "MaskPropagation": 0.11018657684326172, + "MemcpyElimination": 9.414599418640137, + "MutateDataType": 0.06073474884033203, + "NeuronAliasDependencyInduction": 0.020488739013671875, + "NeuronAliasDependencyReset": 0.037583112716674805, + "NeuronInstComb": 0.4045867919921875, + "NeuronLICM": 0.2813999652862549, + "NeuronLoopFusion": 1.6037812232971191, + "NeuronLoopInterchange": 0.0655665397644043, + "NeuronSimplifier": 0.5106499195098877, + "NeuronSimplifyPredicates": 0.2850377559661865, + "NeuronValueNumbering": 0.12716984748840332, + "OptimizeAliasedCopyChain": 0.028412580490112305, + "OptimizeNKIKernels": 1.5485177040100098, + "PAGLayoutOpt": 15.63675308227539, + "PComputeCutting": 0.3207705020904541, + "PGLayoutTilingPipeline": 41.22283172607422, + "PGTiling": 6.012831687927246, + "PadElimination": 0.017678499221801758, + "ParAxesAnnotation": 14.871472358703613, + "PartialLoopFusion": 1.256809949874878, + "PartialSimdFusion": 0.8388330936431885, + "PerfectLoopNest": 0.07282257080078125, + "RecognizeOpIdiom": 0.1260819435119629, + "Recompute": 0.009610652923583984, + "RelaxPredicates": 0.11840128898620605, + "Rematerialization": 0.16344213485717773, + "RemoveShardedPartitionAxes": 1.2759647369384766, + "ReshapeWeights": 0.03425192832946777, + "ResolveAccessConflict": 0.20044612884521484, + "ResolveComplicatePredicates": 0.07434439659118652, + "RewriteReplicationMatmul": 0.04415702819824219, + "RewriteWeights": 0.0949866771697998, + "SFKVectorizer": 6.678906440734863, + "ShardingPropagationAnalysis": 0.7332839965820313, + "SimpleAllReduceTiling": 0.06847286224365234, + "Simplifier": 0.1070256233215332, + "SimplifyMacroPredicates": 0.28906846046447754, + "SimplifyNeuronTensor": 0.3596303462982178, + "SimplifySlice": 0.03178524971008301, + "SimplifyTensor": 0.30094051361083984, + "SpillPSum": 0.6965179443359375, + "SplitAPUnionSets": 0.518521785736084, + "SplitAccGrp": 0.05238986015319824, + "StaticProfiler": 0.13858842849731445, + "StaticTransposeLocalTensor": 0.26463842391967773, + "SundaISel": 1.8421251773834229, + "TCTransform": 0.032810211181640625, + "TensorInitialization": 0.18349289894104004, + "TensorOpSimplifier": 0.8368067741394043, + "TensorOpTransform": 2.44799542427063, + "TileCCOps": 0.2656381130218506, + "TilingProfiler": 0.47725772857666016, + "TransformConvOp": 0.14513254165649414, + "TritiumFusion": 1.311755895614624, + "ValueNumbering": 0.10032439231872559, + "VectorizeDMA": 0.682180643081665, + "VectorizeMatMult": 0.07973051071166992, + "WeightCoalescing": 0.06142139434814453, + "ZeroSizeTensorElimination": 0.0003650188446044922 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 99812, + "StaticProfiler::AifUb": 12.285852432250977, + "StaticProfiler::ArithmeticIntensityTensorizer": 35.23064041137695, + "StaticProfiler::AverageDmaLength": 3003.041015625, + "StaticProfiler::AverageFractalPeUtilization": 98.0271987915039, + "StaticProfiler::AveragePartitionUtilization": 91.27035522460938, + "StaticProfiler::AveragePeUtilization": 79.1353759765625, + "StaticProfiler::DDRTransferBytes": 2019175380, + "StaticProfiler::InternalTransferBytes": 456879296, + "StaticProfiler::LoadExpanded": 535846, + "StaticProfiler::LocalizationEfficiency": 286.7578125, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 294.8894348144531, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 544195, + "StaticProfiler::TotalDynamicInstancesCount": 125802, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 104329, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 71329, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 20017, + "TilingProfiler::PfTransposeInstructionsForIo": 16676, + "TilingProfiler::PfTransposeInstructionsForLocal": 1379, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 173, + "TilingProfiler::SimdInstructionsAfterTiling": 2796, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.002634286880493164, + "DMALocalityOpt": 0.0019507408142089844, + "DMAProfiler": 0.0027413368225097656, + "DataStreaming": 0.004244565963745117, + "DoNothing": 0.0001628398895263672, + "ExpandISAMacro": 0.0027222633361816406, + "FactorizeBlkDims": 0.00846552848815918, + "InferPSumTensor": 0.006723880767822266, + "InferSharedMemLoc": 0.0020198822021484375, + "InsertCoreBarrier": 0.002251148223876953, + "LateLegalizeInst": 0.004643678665161133, + "LateNeuronInstComb": 0.0060007572174072266, + "LegalizeSundaAccess": 0.008618354797363281, + "LegalizeType": 0.005889892578125, + "LowerBroadcast": 0.002414226531982422, + "LowerIntrinsics": 0.002602100372314453, + "LowerTranspose": 0.002422332763671875, + "NeuronInstComb": 0.00612330436706543, + "NeuronLICM": 0.0072476863861083984, + "NeuronSimplifyPredicates": 0.002904653549194336, + "NeuronValueNumbering": 0.0037572383880615234, + "SFKVectorizer": 0.01987004280090332, + "SimpleAllReduceTiling": 0.00252532958984375, + "SimplifyNeuronTensor": 0.048017024993896484, + "SpillPSum": 0.012651920318603516, + "WeightCoalescing": 0.0024788379669189453 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk2/graph.neff b/token_generation_model/_tp0_bk2/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..4944196ff04153b916e82be2739fc0e097fb718a --- /dev/null +++ b/token_generation_model/_tp0_bk2/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0074929c6140282979070bdfc412360950dfa5e2dbe995e0afa9cc7a6661e809 +size 6657024 diff --git a/token_generation_model/_tp0_bk2/log-neuron-cc.txt b/token_generation_model/_tp0_bk2/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb895c54de1994b5fc6b4cbc1b749249b61e9f86 --- /dev/null +++ b/token_generation_model/_tp0_bk2/log-neuron-cc.txt @@ -0,0 +1,4599 @@ +2025-11-04T21:38:36Z INFO 8794 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:36Z INFO 8794 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:36Z INFO 8867 [root]: XLA detected +2025-11-04T21:38:36Z INFO 8867 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:36Z INFO 8867 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2 +2025-11-04T21:38:36Z INFO 8867 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:36Z INFO 8867 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8867 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:36Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:36Z INFO 8867 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:36Z INFO 8867 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8867 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:37Z INFO 8867 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:38:37.048155: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:38:37.062249: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10701 = tuple(%reshape.4385, %scatter.9929, %scatter.9944, %scatter.9957, %scatter.9972, %scatter.9985, %scatter.10000, %scatter.10013, %scatter.10028, %scatter.10041, %scatter.10056, %scatter.10069, %scatter.10084, %scatter.10097, %scatter.10112, %scatter.10125, %scatter.10140, %scatter.10153, %scatter.10168, %scatter.10181, %scatter.10196, %scatter.10209, %scatter.10224, %scatter.10237, %scatter.10252, %scatter.10265, %scatter.10280, %scatter.10293, %scatter.10308, %scatter.10321, %scatter.10336... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:37Z INFO 8867 [job.HLOToTensorizer.0]: IR signature: 18d343f702d85e11e14bff82308918a6b7ba71671aad956bee63a8b88c891ab8 for sg0000/HLOToTensorizer +2025-11-04T21:38:37Z INFO 8867 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:37Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:37Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:37Z INFO 8867 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:37Z INFO 8867 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:37Z INFO 8867 [job.Frontend.0]: Start model loading +2025-11-04T21:38:37Z INFO 8867 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:37Z INFO 8867 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:38:37Z USER 8867 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:37Z INFO 8867 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:38:37Z INFO 8867 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:38:39Z INFO 8867 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.043 seconds +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.028 seconds +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.157 seconds +2025-11-04T21:38:39Z INFO 8867 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.145 seconds +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.748 seconds +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.023 seconds +2025-11-04T21:38:40Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.541 seconds +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.591 seconds +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.133 seconds +2025-11-04T21:38:41Z INFO 8867 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.837 seconds +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.095 seconds +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.074 seconds +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.059 seconds +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.223 seconds +2025-11-04T21:38:42Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.074 seconds +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.331 seconds +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.333 seconds +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.254 seconds +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.921 seconds +2025-11-04T21:38:43Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.064 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.059 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.067 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.067 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.098 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.077 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.057 seconds +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:44Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 2.108 seconds +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.338 seconds +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 2.448 seconds +2025-11-04T21:38:46Z INFO 8867 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.424 seconds +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.014 seconds +2025-11-04T21:38:47Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:48Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:48Z INFO 8867 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.836 seconds +2025-11-04T21:38:48Z INFO 8867 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.883 seconds +2025-11-04T21:38:48Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:48Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 9.121 seconds +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.292 seconds +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 9.415 seconds +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:57Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:58Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.951 seconds +2025-11-04T21:38:58Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:59Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.440 seconds +2025-11-04T21:38:59Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:59Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.335 seconds +2025-11-04T21:38:59Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.617 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.442 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.823 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.163 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.148 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.108 seconds +2025-11-04T21:39:00Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.102 seconds +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.360 seconds +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.638 seconds +2025-11-04T21:39:01Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.703 seconds +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.119 seconds +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.121 seconds +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.072 seconds +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.169 seconds +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:03Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.203 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.121 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.259 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.590 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.032 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.062 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.111 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.103 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.214 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.125 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.056 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.018 seconds +2025-11-04T21:39:04Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.149 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.184 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.194 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.385 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.029 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.102 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.102 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.055 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.100 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.033 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.031 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.031 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.126 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.126 seconds +2025-11-04T21:39:05Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:06Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.095 seconds +2025-11-04T21:39:06Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.167 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.010 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.032 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.032 seconds +2025-11-04T21:39:07Z INFO 8867 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.061 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.029 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.107 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.107 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-9247 AllGather_add(float32 (256, 8) %'transpose.537', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9754 | hlo_id: 9754 | , id = 9247 +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-9263 AllGather_add(uint32 (256, 8) %'transpose.538', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9893 | hlo_id: 9893 | , id = 9263 +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.266 seconds +2025-11-04T21:39:07Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.550 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.137 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.342 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.034 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.038 seconds +2025-11-04T21:39:08Z INFO 8867 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.346 seconds +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.030 seconds +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.200 seconds +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.060 seconds +2025-11-04T21:39:09Z INFO 8867 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.726 seconds +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.364 seconds +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.248 seconds +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:39:10Z INFO 8867 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.136 seconds +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.952 seconds +2025-11-04T21:39:11Z INFO 8867 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:39:12Z INFO 8867 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.457 seconds +2025-11-04T21:39:12Z INFO 8867 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.419 seconds +2025-11-04T21:39:12Z INFO 8867 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:39:12Z INFO 8867 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:13Z INFO 8867 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:17Z INFO 8867 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:39:17Z INFO 8867 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 5.613 seconds +2025-11-04T21:39:17Z INFO 8867 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:39:17Z INFO 8867 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:39:18Z INFO 8867 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:39:32Z INFO 8867 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8867 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 14.871 seconds +2025-11-04T21:39:32Z INFO 8867 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.755 seconds +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 15.637 seconds +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.164 seconds +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.207 seconds +2025-11-04T21:39:33Z INFO 8867 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:39:34Z INFO 8867 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.733 seconds +2025-11-04T21:39:34Z INFO 8867 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:39:41Z INFO 8867 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:39:41Z INFO 8867 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1159 +total number of sharded dags: 408 + +total bytes transferred from input, output, non local tensors: 1962371400 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 1959739524 +% bytes transferred with 2x bandwidths: 99.87 + +NC0 FLOPs: 7131237529 +NC1 FLOPs: 7123898240 +% FLOPs sharded: 99.94 + + +Shard dim: 512, Number of dags: 198 +Matmuls sharded with this dim: +[2,2,64] @ [2,64,512(s)] = [2,512(s)] Number of occurrences: 28 +[2,512(s)] @ [512(s),128] = [2,128] Number of occurrences: 28 + + +Shard dim: 2, Number of dags: 196 +Matmuls sharded with this dim: +[8,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [8,8,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,128] = [8,2,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [8,2,2,2,2,64] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [8,2,2,2,64] Number of occurrences: 28 +[8,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [8,2(s),2,4,128] Number of occurrences: 28 +[8,2,8,128] @ [2,8,128,2(s),6,2,128] = [8,2(s),6,2,128] Number of occurrences: 56 + + +Shard dim: 256, Number of dags: 10 +Matmuls sharded with this dim: + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2,8,128] @ [2,8,128,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:39:42Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:42Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:42Z INFO 8867 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.443 seconds +2025-11-04T21:39:42Z INFO 8867 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:39:43Z INFO 8867 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8867 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 1.276 seconds +2025-11-04T21:39:43Z INFO 8867 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8867 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 9.386 seconds +2025-11-04T21:39:43Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.110 seconds +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.178 seconds +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.232 seconds +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:39:44Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (79, 'AG3736'), (80, 'AG3735'), (218, 'AG3727'), (474, 'AG3726'), (274, 'AG3733')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (83, 'AG3752'), (84, 'AG3751'), (218, 'AG3727'), (474, 'AG3726'), (272, 'AG3749')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (87, 'AG3768'), (88, 'AG3767'), (218, 'AG3727'), (474, 'AG3726'), (270, 'AG3765')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10312 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (91, 'AG3784'), (92, 'AG3783'), (218, 'AG3727'), (474, 'AG3726'), (268, 'AG3781')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (95, 'AG3800'), (96, 'AG3799'), (218, 'AG3727'), (474, 'AG3726'), (266, 'AG3797')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (99, 'AG3816'), (100, 'AG3815'), (218, 'AG3727'), (474, 'AG3726'), (264, 'AG3813')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11065 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (103, 'AG3832'), (104, 'AG3831'), (218, 'AG3727'), (474, 'AG3726'), (262, 'AG3829')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11316 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (107, 'AG3848'), (108, 'AG3847'), (218, 'AG3727'), (474, 'AG3726'), (260, 'AG3845')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (111, 'AG3864'), (112, 'AG3863'), (218, 'AG3727'), (474, 'AG3726'), (258, 'AG3861')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11818 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (115, 'AG3880'), (116, 'AG3879'), (218, 'AG3727'), (474, 'AG3726'), (256, 'AG3877')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (119, 'AG3896'), (120, 'AG3895'), (218, 'AG3727'), (474, 'AG3726'), (254, 'AG3893')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12320 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (123, 'AG3912'), (124, 'AG3911'), (218, 'AG3727'), (474, 'AG3726'), (252, 'AG3909')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12571 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (127, 'AG3928'), (128, 'AG3927'), (218, 'AG3727'), (474, 'AG3726'), (250, 'AG3925')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (131, 'AG3944'), (132, 'AG3943'), (218, 'AG3727'), (474, 'AG3726'), (248, 'AG3941')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13073 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (135, 'AG3960'), (136, 'AG3959'), (218, 'AG3727'), (474, 'AG3726'), (246, 'AG3957')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13324 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (139, 'AG3976'), (140, 'AG3975'), (218, 'AG3727'), (474, 'AG3726'), (244, 'AG3973')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (143, 'AG3992'), (144, 'AG3991'), (218, 'AG3727'), (474, 'AG3726'), (242, 'AG3989')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13826 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (147, 'AG4008'), (148, 'AG4007'), (218, 'AG3727'), (474, 'AG3726'), (240, 'AG4005')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (151, 'AG4024'), (152, 'AG4023'), (218, 'AG3727'), (474, 'AG3726'), (238, 'AG4021')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14328 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (155, 'AG4040'), (156, 'AG4039'), (218, 'AG3727'), (474, 'AG3726'), (236, 'AG4037')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (159, 'AG4056'), (160, 'AG4055'), (218, 'AG3727'), (474, 'AG3726'), (234, 'AG4053')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (163, 'AG4072'), (164, 'AG4071'), (218, 'AG3727'), (474, 'AG3726'), (232, 'AG4069')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (167, 'AG4088'), (168, 'AG4087'), (218, 'AG3727'), (474, 'AG3726'), (230, 'AG4085')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15332 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (171, 'AG4104'), (172, 'AG4103'), (218, 'AG3727'), (474, 'AG3726'), (228, 'AG4101')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (175, 'AG4120'), (176, 'AG4119'), (218, 'AG3727'), (474, 'AG3726'), (226, 'AG4117')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (179, 'AG4136'), (180, 'AG4135'), (218, 'AG3727'), (474, 'AG3726'), (224, 'AG4133')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16085 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (183, 'AG4152'), (184, 'AG4151'), (218, 'AG3727'), (474, 'AG3726'), (222, 'AG4149')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 2, 2, 8, 2, 256, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (187, 'AG4168'), (188, 'AG4167'), (218, 'AG3727'), (474, 'AG3726'), (220, 'AG4165')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(78, 'AG3741'), (273, 'AG3740'), (79, 'AG3736'), (80, 'AG3735'), (81, 'AG3734'), (358, 'AG3739'), (470, 'AG3738')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (471, 'AG3737')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23387 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(79, 'AG3736'), (191, 'AG3731'), (80, 'AG3735'), (81, 'AG3734'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(469, 'AG3742'), (74, 'AG3744'), (357, 'AG3743')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(37, 'AG3748'), (1, 'AG3745'), (356, 'AG3747'), (468, 'AG3746')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23399 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(82, 'AG3757'), (271, 'AG3756'), (83, 'AG3752'), (84, 'AG3751'), (85, 'AG3750'), (355, 'AG3755'), (466, 'AG3754')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (467, 'AG3753')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23402 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(83, 'AG3752'), (191, 'AG3731'), (84, 'AG3751'), (85, 'AG3750'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(465, 'AG3758'), (75, 'AG3760'), (354, 'AG3759')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(38, 'AG3764'), (2, 'AG3761'), (353, 'AG3763'), (464, 'AG3762')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(86, 'AG3773'), (269, 'AG3772'), (87, 'AG3768'), (88, 'AG3767'), (89, 'AG3766'), (352, 'AG3771'), (462, 'AG3770')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (463, 'AG3769')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23417 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(87, 'AG3768'), (191, 'AG3731'), (88, 'AG3767'), (89, 'AG3766'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(461, 'AG3774'), (76, 'AG3776'), (351, 'AG3775')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(39, 'AG3780'), (3, 'AG3777'), (350, 'AG3779'), (460, 'AG3778')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input91|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23429 of IO tensor {'CrossPassTensor': ''}bfloat16 %input93|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(90, 'AG3789'), (267, 'AG3788'), (91, 'AG3784'), (92, 'AG3783'), (93, 'AG3782'), (349, 'AG3787'), (458, 'AG3786')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (459, 'AG3785')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23432 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(91, 'AG3784'), (191, 'AG3731'), (92, 'AG3783'), (93, 'AG3782'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(457, 'AG3790'), (192, 'AG3792'), (348, 'AG3791')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(40, 'AG3796'), (4, 'AG3793'), (347, 'AG3795'), (456, 'AG3794')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(94, 'AG3805'), (265, 'AG3804'), (95, 'AG3800'), (96, 'AG3799'), (97, 'AG3798'), (346, 'AG3803'), (454, 'AG3802')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (455, 'AG3801')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(95, 'AG3800'), (191, 'AG3731'), (96, 'AG3799'), (97, 'AG3798'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(453, 'AG3806'), (193, 'AG3808'), (345, 'AG3807')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(41, 'AG3812'), (5, 'AG3809'), (344, 'AG3811'), (452, 'AG3810')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(98, 'AG3821'), (263, 'AG3820'), (99, 'AG3816'), (100, 'AG3815'), (101, 'AG3814'), (343, 'AG3819'), (450, 'AG3818')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (451, 'AG3817')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(99, 'AG3816'), (191, 'AG3731'), (100, 'AG3815'), (101, 'AG3814'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(449, 'AG3822'), (194, 'AG3824'), (342, 'AG3823')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG3828'), (6, 'AG3825'), (341, 'AG3827'), (448, 'AG3826')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(102, 'AG3837'), (261, 'AG3836'), (103, 'AG3832'), (104, 'AG3831'), (105, 'AG3830'), (340, 'AG3835'), (446, 'AG3834')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (447, 'AG3833')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(103, 'AG3832'), (191, 'AG3731'), (104, 'AG3831'), (105, 'AG3830'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(445, 'AG3838'), (195, 'AG3840'), (339, 'AG3839')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(43, 'AG3844'), (7, 'AG3841'), (338, 'AG3843'), (444, 'AG3842')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(106, 'AG3853'), (259, 'AG3852'), (107, 'AG3848'), (108, 'AG3847'), (109, 'AG3846'), (337, 'AG3851'), (442, 'AG3850')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (443, 'AG3849')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23492 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(107, 'AG3848'), (191, 'AG3731'), (108, 'AG3847'), (109, 'AG3846'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(441, 'AG3854'), (196, 'AG3856'), (336, 'AG3855')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(44, 'AG3860'), (8, 'AG3857'), (335, 'AG3859'), (440, 'AG3858')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(110, 'AG3869'), (257, 'AG3868'), (111, 'AG3864'), (112, 'AG3863'), (113, 'AG3862'), (334, 'AG3867'), (438, 'AG3866')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (439, 'AG3865')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23507 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(111, 'AG3864'), (191, 'AG3731'), (112, 'AG3863'), (113, 'AG3862'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(437, 'AG3870'), (197, 'AG3872'), (333, 'AG3871')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG3876'), (9, 'AG3873'), (332, 'AG3875'), (436, 'AG3874')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(114, 'AG3885'), (255, 'AG3884'), (115, 'AG3880'), (116, 'AG3879'), (117, 'AG3878'), (331, 'AG3883'), (434, 'AG3882')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (435, 'AG3881')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23522 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(115, 'AG3880'), (191, 'AG3731'), (116, 'AG3879'), (117, 'AG3878'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(433, 'AG3886'), (198, 'AG3888'), (330, 'AG3887')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(46, 'AG3892'), (10, 'AG3889'), (329, 'AG3891'), (432, 'AG3890')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(118, 'AG3901'), (253, 'AG3900'), (119, 'AG3896'), (120, 'AG3895'), (121, 'AG3894'), (328, 'AG3899'), (430, 'AG3898')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (431, 'AG3897')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23537 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(119, 'AG3896'), (191, 'AG3731'), (120, 'AG3895'), (121, 'AG3894'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(429, 'AG3902'), (199, 'AG3904'), (327, 'AG3903')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(47, 'AG3908'), (11, 'AG3905'), (326, 'AG3907'), (428, 'AG3906')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(122, 'AG3917'), (251, 'AG3916'), (123, 'AG3912'), (124, 'AG3911'), (125, 'AG3910'), (325, 'AG3915'), (426, 'AG3914')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (427, 'AG3913')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23552 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(123, 'AG3912'), (191, 'AG3731'), (124, 'AG3911'), (125, 'AG3910'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(425, 'AG3918'), (200, 'AG3920'), (324, 'AG3919')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(48, 'AG3924'), (12, 'AG3921'), (323, 'AG3923'), (424, 'AG3922')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(126, 'AG3933'), (249, 'AG3932'), (127, 'AG3928'), (128, 'AG3927'), (129, 'AG3926'), (322, 'AG3931'), (422, 'AG3930')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (423, 'AG3929')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(127, 'AG3928'), (191, 'AG3731'), (128, 'AG3927'), (129, 'AG3926'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(421, 'AG3934'), (201, 'AG3936'), (321, 'AG3935')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(49, 'AG3940'), (13, 'AG3937'), (320, 'AG3939'), (420, 'AG3938')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(130, 'AG3949'), (247, 'AG3948'), (131, 'AG3944'), (132, 'AG3943'), (133, 'AG3942'), (319, 'AG3947'), (418, 'AG3946')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (419, 'AG3945')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG3944'), (191, 'AG3731'), (132, 'AG3943'), (133, 'AG3942'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(417, 'AG3950'), (202, 'AG3952'), (318, 'AG3951')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(50, 'AG3956'), (14, 'AG3953'), (317, 'AG3955'), (416, 'AG3954')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(134, 'AG3965'), (245, 'AG3964'), (135, 'AG3960'), (136, 'AG3959'), (137, 'AG3958'), (316, 'AG3963'), (414, 'AG3962')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (415, 'AG3961')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23597 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(135, 'AG3960'), (191, 'AG3731'), (136, 'AG3959'), (137, 'AG3958'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(413, 'AG3966'), (203, 'AG3968'), (315, 'AG3967')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(51, 'AG3972'), (15, 'AG3969'), (314, 'AG3971'), (412, 'AG3970')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(138, 'AG3981'), (243, 'AG3980'), (139, 'AG3976'), (140, 'AG3975'), (141, 'AG3974'), (313, 'AG3979'), (410, 'AG3978')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (411, 'AG3977')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23612 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(139, 'AG3976'), (191, 'AG3731'), (140, 'AG3975'), (141, 'AG3974'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(409, 'AG3982'), (204, 'AG3984'), (312, 'AG3983')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(52, 'AG3988'), (16, 'AG3985'), (311, 'AG3987'), (408, 'AG3986')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(142, 'AG3997'), (241, 'AG3996'), (143, 'AG3992'), (144, 'AG3991'), (145, 'AG3990'), (310, 'AG3995'), (406, 'AG3994')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (407, 'AG3993')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23627 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(143, 'AG3992'), (191, 'AG3731'), (144, 'AG3991'), (145, 'AG3990'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(405, 'AG3998'), (205, 'AG4000'), (309, 'AG3999')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(53, 'AG4004'), (17, 'AG4001'), (308, 'AG4003'), (404, 'AG4002')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(146, 'AG4013'), (239, 'AG4012'), (147, 'AG4008'), (148, 'AG4007'), (149, 'AG4006'), (307, 'AG4011'), (402, 'AG4010')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (403, 'AG4009')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23642 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(147, 'AG4008'), (191, 'AG3731'), (148, 'AG4007'), (149, 'AG4006'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(401, 'AG4014'), (206, 'AG4016'), (306, 'AG4015')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(54, 'AG4020'), (18, 'AG4017'), (305, 'AG4019'), (400, 'AG4018')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(150, 'AG4029'), (237, 'AG4028'), (151, 'AG4024'), (152, 'AG4023'), (153, 'AG4022'), (304, 'AG4027'), (398, 'AG4026')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (399, 'AG4025')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(151, 'AG4024'), (191, 'AG3731'), (152, 'AG4023'), (153, 'AG4022'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(397, 'AG4030'), (207, 'AG4032'), (303, 'AG4031')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(55, 'AG4036'), (19, 'AG4033'), (302, 'AG4035'), (396, 'AG4034')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23682 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(154, 'AG4045'), (235, 'AG4044'), (155, 'AG4040'), (156, 'AG4039'), (157, 'AG4038'), (301, 'AG4043'), (394, 'AG4042')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (395, 'AG4041')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(155, 'AG4040'), (191, 'AG3731'), (156, 'AG4039'), (157, 'AG4038'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(393, 'AG4046'), (208, 'AG4048'), (300, 'AG4047')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(56, 'AG4052'), (20, 'AG4049'), (299, 'AG4051'), (392, 'AG4050')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23683 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(158, 'AG4061'), (233, 'AG4060'), (159, 'AG4056'), (160, 'AG4055'), (161, 'AG4054'), (298, 'AG4059'), (390, 'AG4058')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (391, 'AG4057')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(159, 'AG4056'), (191, 'AG3731'), (160, 'AG4055'), (161, 'AG4054'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(389, 'AG4062'), (209, 'AG4064'), (297, 'AG4063')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(57, 'AG4068'), (21, 'AG4065'), (296, 'AG4067'), (388, 'AG4066')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(162, 'AG4077'), (231, 'AG4076'), (163, 'AG4072'), (164, 'AG4071'), (165, 'AG4070'), (295, 'AG4075'), (386, 'AG4074')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (387, 'AG4073')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(163, 'AG4072'), (191, 'AG3731'), (164, 'AG4071'), (165, 'AG4070'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(385, 'AG4078'), (210, 'AG4080'), (294, 'AG4079')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(58, 'AG4084'), (22, 'AG4081'), (293, 'AG4083'), (384, 'AG4082')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(166, 'AG4093'), (229, 'AG4092'), (167, 'AG4088'), (168, 'AG4087'), (169, 'AG4086'), (292, 'AG4091'), (382, 'AG4090')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (383, 'AG4089')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(167, 'AG4088'), (191, 'AG3731'), (168, 'AG4087'), (169, 'AG4086'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(381, 'AG4094'), (211, 'AG4096'), (291, 'AG4095')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(59, 'AG4100'), (23, 'AG4097'), (290, 'AG4099'), (380, 'AG4098')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(170, 'AG4109'), (227, 'AG4108'), (171, 'AG4104'), (172, 'AG4103'), (173, 'AG4102'), (289, 'AG4107'), (378, 'AG4106')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (379, 'AG4105')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(171, 'AG4104'), (191, 'AG3731'), (172, 'AG4103'), (173, 'AG4102'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(377, 'AG4110'), (212, 'AG4112'), (288, 'AG4111')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(60, 'AG4116'), (24, 'AG4113'), (287, 'AG4115'), (376, 'AG4114')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(174, 'AG4125'), (225, 'AG4124'), (175, 'AG4120'), (176, 'AG4119'), (177, 'AG4118'), (286, 'AG4123'), (374, 'AG4122')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (375, 'AG4121')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23747 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(175, 'AG4120'), (191, 'AG3731'), (176, 'AG4119'), (177, 'AG4118'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(373, 'AG4126'), (213, 'AG4128'), (285, 'AG4127')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(61, 'AG4132'), (25, 'AG4129'), (284, 'AG4131'), (372, 'AG4130')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(178, 'AG4141'), (223, 'AG4140'), (179, 'AG4136'), (180, 'AG4135'), (181, 'AG4134'), (283, 'AG4139'), (370, 'AG4138')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (371, 'AG4137')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(179, 'AG4136'), (191, 'AG3731'), (180, 'AG4135'), (181, 'AG4134'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(369, 'AG4142'), (214, 'AG4144'), (282, 'AG4143')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(62, 'AG4148'), (26, 'AG4145'), (281, 'AG4147'), (368, 'AG4146')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23773 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23787 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(182, 'AG4157'), (221, 'AG4156'), (183, 'AG4152'), (184, 'AG4151'), (185, 'AG4150'), (280, 'AG4155'), (366, 'AG4154')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (367, 'AG4153')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23777 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(183, 'AG4152'), (191, 'AG3731'), (184, 'AG4151'), (185, 'AG4150'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(365, 'AG4158'), (215, 'AG4160'), (279, 'AG4159')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(63, 'AG4164'), (27, 'AG4161'), (278, 'AG4163'), (364, 'AG4162')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23802 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(186, 'AG4173'), (219, 'AG4172'), (187, 'AG4168'), (188, 'AG4167'), (189, 'AG4166'), (277, 'AG4171'), (362, 'AG4170')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (363, 'AG4169')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23792 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(187, 'AG4168'), (191, 'AG3731'), (188, 'AG4167'), (189, 'AG4166'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(361, 'AG4174'), (216, 'AG4176'), (276, 'AG4175')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(64, 'AG4180'), (28, 'AG4177'), (275, 'AG4179'), (360, 'AG4178')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23803 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(190, 'AG4182'), (217, 'AG4181'), (191, 'AG3731')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23385 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23386 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16621 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23361 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16596 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16639 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16656 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.624 seconds +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.265 seconds +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.321 seconds +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.068 seconds +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.031 seconds +2025-11-04T21:39:47Z INFO 8867 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:39:50Z INFO 8867 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:39:50Z INFO 8867 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.660 seconds +2025-11-04T21:39:50Z INFO 8867 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 6.013 seconds +2025-11-04T21:39:50Z INFO 8867 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.906 seconds +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.128 seconds +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.334 seconds +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 41.223 seconds +2025-11-04T21:39:51Z INFO 8867 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.477 seconds +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.318 seconds +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:39:52Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.647 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.071 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.719 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.311 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.312 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.084 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.044 seconds +2025-11-04T21:39:54Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.180 seconds +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.120 seconds +2025-11-04T21:39:55Z INFO 8867 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.251 seconds +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 384: matmul_128x128x8 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.097 seconds +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.364 seconds +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.365 seconds +2025-11-04T21:39:58Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:39:59Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.713 seconds +2025-11-04T21:39:59Z INFO 8867 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.429 seconds +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.530 seconds +2025-11-04T21:40:00Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.528 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 1.059 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.073 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.235 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.095 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.034 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.105 seconds +2025-11-04T21:40:01Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:40:02Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8867 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.289 seconds +2025-11-04T21:40:02Z INFO 8867 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.381 seconds +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.510 seconds +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.511 seconds +2025-11-04T21:40:03Z INFO 8867 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.079 seconds +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.301 seconds +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/LICM]: LICM finished after 0.129 seconds +2025-11-04T21:40:04Z INFO 8867 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.842 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.020 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.038 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.080 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.073 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.354 seconds +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:40:06Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.562 seconds +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.279 seconds +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.256 seconds +2025-11-04T21:40:07Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.252 seconds +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.254 seconds +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.604 seconds +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.066 seconds +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.364 seconds +2025-11-04T21:40:08Z INFO 8867 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:09Z INFO 8867 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:40:09Z INFO 8867 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.518 seconds +2025-11-04T21:40:09Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:09Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.050 seconds +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.199 seconds +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.182 seconds +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.442 seconds +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.127 seconds +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:10Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.206 seconds +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.190 seconds +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.405 seconds +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.093 seconds +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:40:11Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.631 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.050 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.682 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.266 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.091 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.063 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.063 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.097 seconds +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:40:12Z INFO 8867 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:40:13Z INFO 8867 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.838 seconds +2025-11-04T21:40:13Z INFO 8867 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:40:13Z INFO 8867 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.839 seconds +2025-11-04T21:40:13Z INFO 8867 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:40:14Z INFO 8867 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:40:14Z INFO 8867 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.312 seconds +2025-11-04T21:40:14Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:14Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.571 seconds +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.571 seconds +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.080 seconds +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:40:15Z INFO 8867 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 1.256 seconds +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 1.257 seconds +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.194 seconds +2025-11-04T21:40:16Z INFO 8867 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.516 seconds +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.105 seconds +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:17Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.955 seconds +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.189 seconds +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.152 seconds +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.052 seconds +2025-11-04T21:40:18Z INFO 8867 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:19Z INFO 8867 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8867 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.697 seconds +2025-11-04T21:40:19Z INFO 8867 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.310 seconds +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.057 seconds +2025-11-04T21:40:20Z INFO 8867 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.138 seconds +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.281 seconds +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:21Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.742 seconds +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.648 seconds +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.391 seconds +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.061 seconds +2025-11-04T21:40:22Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.997 seconds +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.118 seconds +2025-11-04T21:40:23Z INFO 8867 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.183 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.285 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.087 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.039 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.360 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.043 seconds +2025-11-04T21:40:24Z INFO 8867 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:25Z INFO 8867 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:40:25Z INFO 8867 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.170 seconds +2025-11-04T21:40:25Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:29Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 2.238 seconds +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.252 seconds +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 6.679 seconds +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.152 seconds +2025-11-04T21:40:31Z INFO 8867 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.175 seconds +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.068 seconds +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.133 seconds +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 13.425% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'38147.56431'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=56430, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_38147 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 2.035% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'38660.56441'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=56439, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_38660 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 138.776us (32.031MiB, est bw: 242.024GB/s, 1.223% of tot. time) for bfloat16<128 x 8200> TongaSB partitions[2] bfloat16 (2, 8, 128, 8200) %'all_gather.1_nostride_60851'(init=0.0)[i242_0_0_42945,T_i2,i0.128,i1.8200] = load bfloat16<128 x 8200> non_local bfloat16 (16384,) %'all_gather.1'[8i0.128+1024T_i2+i1.8200] # id=48224, src_id=None, , attrs={'can_read_uninit': True}, instances=16 # dl = tensor_op_name: _add.383 | hlo_id: 383 | [[i0.128];[i1.8200]] -> [[i0.128];[i1.8200]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input68_local_40013'[i242_0_0_42945,4i243_0_0_0+i243_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input68'[4i243_0_0_0+i243_0_0_1,i242_0_0_42945,i0.128,i1.3072] # id=48239, src_id=None, , instances=16 # dl = tensor_op_name: _dot.413 | hlo_id: 13522 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input79_local_40116'[i414_0_0_43019,4i415_0_0_0+i415_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input79'[4i415_0_0_0+i415_0_0_1,i414_0_0_43019,i0.128,i1.3072] # id=48434, src_id=None, , instances=16 # dl = tensor_op_name: _dot.757 | hlo_id: 13633 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input90_local_40219'[i586_0_0_43093,4i587_0_0_0+i587_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input90'[4i587_0_0_0+i587_0_0_1,i586_0_0_43093,i0.128,i1.3072] # id=48629, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1101 | hlo_id: 13744 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input101_local_40322'[i758_0_0_43167,4i759_0_0_0+i759_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input101'[4i759_0_0_0+i759_0_0_1,i758_0_0_43167,i0.128,i1.3072] # id=48824, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1445 | hlo_id: 13855 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input112_local_40425'[i930_0_0_43241,4i931_0_0_0+i931_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input112'[4i931_0_0_0+i931_0_0_1,i930_0_0_43241,i0.128,i1.3072] # id=49019, src_id=None, , instances=16 # dl = tensor_op_name: _dot.1789 | hlo_id: 13966 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input123_local_40528'[i1102_0_0_43315,4i1103_0_0_0+i1103_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input123'[4i1103_0_0_0+i1103_0_0_1,i1102_0_0_43315,i0.128,i1.3072] # id=49214, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2133 | hlo_id: 14077 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 58.827us (12.000MiB, est bw: 213.895GB/s, 0.519% of tot. time) for bfloat16<128 x 3072> TongaSB partitions[2] bfloat16 (2, 8, 128, 3072) %'input134_local_40631'[i1274_0_0_43389,4i1275_0_0_0+i1275_0_0_1,i0.128,i1.3072] = load bfloat16<128 x 3072> {'CrossPassTensor': ''}bfloat16 (8, 2, 128, 3072) %'input134'[4i1275_0_0_0+i1275_0_0_1,i1274_0_0_43389,i0.128,i1.3072] # id=49409, src_id=None, , instances=16 # dl = tensor_op_name: _dot.2477 | hlo_id: 14188 | [[i0.128];[i1.3072]] -> [[i0.128];[i1.3072]] +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.084 seconds +2025-11-04T21:40:32Z INFO 8867 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.017 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.048 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.028 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.004 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.013 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.007 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.048 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.020 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8867 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.549 seconds +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.880 seconds +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.880 seconds +2025-11-04T21:40:34Z INFO 8867 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:40:34Z WARNING 8867 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 79.10 percent of all matmul computation +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.139 seconds +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.519 seconds +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.105 seconds +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.122 seconds +2025-11-04T21:40:35Z INFO 8867 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.238 seconds +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.718 seconds +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.719 seconds +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.143 seconds +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:40:36Z INFO 8867 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:40:37Z INFO 8867 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:40:37Z INFO 8867 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.166 seconds +2025-11-04T21:40:37Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:39Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:39Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.917 seconds +2025-11-04T21:40:40Z INFO 8867 [Tensorizer]: BirCodeGen estimate #instances=76731 in sg0000 +2025-11-04T21:40:40Z INFO 8867 [Tensorizer]: IR signature: 4ff84c54acbc6544f026a446f5148b94ae9276bff6a124479d8afa01ee0cbeeb for nc00/sg0000/TensorizerBIR +2025-11-04T21:40:40Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:42Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:42Z INFO 8867 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.893 seconds +2025-11-04T21:40:44Z INFO 8867 [Tensorizer]: BirCodeGen estimate #instances=76731 in sg0000 +2025-11-04T21:40:44Z INFO 8867 [Tensorizer]: IR signature: e4921ec20e9cdf2046f63d10a89f0cb06eb90f4f49bb5a6f9d625ab0adc8c8d4 for nc01/sg0000/TensorizerBIR +2025-11-04T21:40:44Z INFO 8867 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:40:44Z INFO 8867 [Tensorizer]: Successfully built model. +2025-11-04T21:40:44Z USER 8867 [root/Tensorizer/Tensorizer]: Tensorizer finished after 126.973 seconds +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: End tensorization +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input0 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input1 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input2 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input3 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input4 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input5 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input6 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input7 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input8 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input9 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input10 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input11 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input12 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input13 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input14 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input15 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input16 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input17 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input18 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input19 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input20 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input21 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input22 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input23 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input24 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input25 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input26 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input27 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input28 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input29 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input30 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input31 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input32 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input33 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input34 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input35 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input36 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input37 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input38 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input39 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input40 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input41 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input42 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input43 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input44 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input45 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input46 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input47 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input48 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input49 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input50 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input51 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input52 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input53 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input54 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input55 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input56 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input57 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input58 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input59 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input60 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input61 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input62 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input63 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input64 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input65 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input66 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input67 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input68 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input69 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input70 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input71 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input72 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input73 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input74 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input75 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input76 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input77 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input78 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input79 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input80 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input81 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input82 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input83 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input84 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input85 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input86 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input87 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input88 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input89 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input90 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input91 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input92 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input93 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input94 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input95 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input96 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input97 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input98 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input99 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input100 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input101 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input102 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input103 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input104 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input105 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input106 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input107 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input108 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input109 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input110 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input111 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input112 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input113 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input114 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input115 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input116 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input117 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input118 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input119 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input120 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input121 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input122 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input123 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input124 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input125 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input126 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input127 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input128 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input129 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input130 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input131 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input132 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input133 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input134 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input135 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input136 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input137 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input138 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input139 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input140 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input141 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input142 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input143 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input144 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input145 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input146 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input147 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input148 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input149 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input150 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input151 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input152 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input153 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input154 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input155 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input156 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input157 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input158 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input159 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input160 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input161 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input162 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input163 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input164 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input165 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input166 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input167 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input168 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input169 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input170 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input171 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input172 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input173 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input174 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input175 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input176 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input177 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input178 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input179 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input180 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input181 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input182 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input183 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input184 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input185 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input186 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input187 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input188 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input189 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input190 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input191 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input192 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input193 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input194 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input195 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input196 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input197 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input198 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input199 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input200 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input201 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input202 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input203 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input204 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input205 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input206 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input207 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input208 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input209 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input210 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input211 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input212 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input213 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input214 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input215 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input216 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input217 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input218 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input219 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input220 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input221 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input222 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input223 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input224 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input225 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input226 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input227 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input228 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input229 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input230 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input231 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input232 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input233 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input234 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input235 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input236 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input237 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input238 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input239 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input240 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input241 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input242 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input243 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input244 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input245 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input246 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input247 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input248 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input249 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input250 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input251 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input252 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input253 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input254 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input255 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input256 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input257 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input258 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input259 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input260 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input261 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input262 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input263 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input264 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input265 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input266 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input267 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input268 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input269 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input270 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input271 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input272 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input273 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input274 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input275 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input276 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input277 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input278 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input279 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input280 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input281 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input282 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input283 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input284 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input285 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input286 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input287 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input288 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input289 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input290 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input291 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input292 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input293 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input294 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input295 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input296 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input297 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input298 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input299 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input300 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input301 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input302 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input303 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input304 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input305 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input306 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input307 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input308 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input309 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input310 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input311 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input312 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input313 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input314 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input315 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input316 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input317 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input318 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input319 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input320 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input321 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input322 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input323 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input324 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input325 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input326 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input327 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input328 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input329 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input330 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input331 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input332 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input333 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input334 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input335 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input336 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input337 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input338 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input339 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input340 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input341 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input342 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input343 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input344 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input345 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input346 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input347 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input348 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input349 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input350 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input351 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input352 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input353 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input354 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input355 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input356 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input357 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input358 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input359 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input360 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input361 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input362 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input363 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input364 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input365 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input366 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input367 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input368 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input369 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Network input: input370 +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:44Z INFO 8867 [job.Frontend.0]: Job #0 finished +2025-11-04T21:40:44Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:40:44Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:40:44Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:40:44Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels spill_reload,scalar_dynamic_offset,io,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:40:44Z INFO 8867 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:40:44Z INFO 9596 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:40:44Z INFO 9596 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:40:44Z INFO 9596 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:40:44Z INFO 9596 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:40:45Z INFO 9596 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh" +2025-11-04T21:40:45Z INFO 9596 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:40:45Z INFO 9596 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:40:45Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:45Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 397mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 397mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z WARNING 9596 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.363.63805}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:45Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.363.63805}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.158 seconds +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 570mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.214 seconds +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:45Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.225 seconds +2025-11-04T21:40:45Z INFO 9596 [BackendPassManager]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:45Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:45Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.006 seconds +2025-11-04T21:40:45Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12120 memory location(s), 2 block(s), and 10056 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:45Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.014 seconds +2025-11-04T21:40:45Z INFO 9596 [BackendPassManager]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:45Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.004 seconds +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.005 seconds +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 597mb, ru_maxrss: 909mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z INFO 9596 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:45Z INFO 9596 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:45 2025 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:45 2025 + +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Total count: 63696 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Matmult: 49295 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: GenericCopy: 6969 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Load: 2196 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: TensorScalarPtr: 1716 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: TensorTensor: 1274 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Memset: 247 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Select: 58 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Iota: 58 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: TensorReduce: 35 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:40:47Z USER 9596 (nc01/sg00) [ModuleForkPass]: unroll finished after 1.434 seconds +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1406mb, ru_maxrss: 1406mb (delta=497mb) +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37236 memory location(s), 1 block(s), and 63696 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=37236 blocks=1 instructions=63696 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:45 2025 + +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Total count: 64856 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Matmult: 49295 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: GenericCopy: 6969 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: TensorScalarPtr: 2276 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Load: 2196 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: TensorTensor: 1274 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Iota: 394 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Save: 378 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Memset: 247 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Select: 58 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: TensorReduce: 35 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:40:47Z USER 9596 (nc00/sg00) [ModuleForkPass]: unroll finished after 1.577 seconds +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=497mb) +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37236 memory location(s), 1 block(s), and 64856 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=37236 blocks=1 instructions=64856 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z USER 9596 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.206 seconds +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z USER 9596 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.181 seconds +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 1.778 seconds +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=497mb) +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:47Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.005 seconds +2025-11-04T21:40:47Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 31704 memory location(s), 2 block(s), and 126952 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.010 seconds +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: curr_vmrss: 1234mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:47Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:47Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:47Z USER 9596 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.198 seconds +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:47Z USER 9596 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.206 seconds +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.210 seconds +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:47Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:47Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:47Z USER 9596 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:47Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.018 seconds +2025-11-04T21:40:48Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 31704 memory location(s), 2 block(s), and 126952 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:48Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.024 seconds +2025-11-04T21:40:48Z INFO 9596 [BackendPassManager]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:48Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=31704 blocks=2 instructions=126952 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.015 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.016 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.011 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.012 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.062 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.009 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.073 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z WARNING 9596 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.010 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z WARNING 9596 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:48Z INFO 9596 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:48Z INFO 9596 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.002 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.023 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.026 seconds +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.071 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:48Z INFO 9596 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.002 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.026 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.027 seconds +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.077 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.300 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.015 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.328 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.047 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.017 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1238mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.047 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.107 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.010 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.008 seconds +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15124 memory location(s), 1 block(s), and 62107 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=15124 blocks=1 instructions=62107 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.108 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: To Spill 3 multi-layer tensors +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.011 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.009 seconds +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1239mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16580 memory location(s), 1 block(s), and 64845 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=16580 blocks=1 instructions=64845 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: To Spill 4 multi-layer tensors +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:48Z INFO 9596 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9596 (nc01/sg00) [build_flow_deps]: Allocs: 15132 instructions: 62115 +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9596 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [build_flow_deps]: Allocs: 16590 instructions: 64851 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 159064 edges +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [build_flow_deps]: Done build fdeps 159064 Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 176577 edges +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [build_flow_deps]: Done build fdeps 176577 Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.614 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1269mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15132 memory location(s), 1 block(s), and 62115 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=15132 blocks=1 instructions=62115 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:49 2025 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.667 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16590 memory location(s), 1 block(s), and 64851 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=16590 blocks=1 instructions=64851 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 31 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.192 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15131 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=15131 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15132 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=15132 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.008 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.008 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1005mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: size = 7338 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.269 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16559 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=16559 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16560 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=16560 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.009 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.009 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: found 16085 edges +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: mean: 4.38403 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: median: 6.26591 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 128680 bytes +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: size = 7510 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: lo = 7264 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: total = 7338 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.303 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1006mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: found 16143 edges +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: mean: 4.29907 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: median: 6.2265 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 129144 bytes +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: lo = 7436 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: total = 7510 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.251 seconds +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1007mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:49Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.128 seconds +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1007mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:49Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:49Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:50Z USER 9596 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.115 seconds +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1010mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:50Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 363 PSUM Banks +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 314 PSUM Banks +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 554 PSUM Banks +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.378 seconds +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1014mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 999969358 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3736 bytes +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1773568 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 448 bytes +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: size = 7221 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 260 PSUM Banks +2025-11-04T21:40:50Z USER 9596 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.410 seconds +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1014mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:50Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: found 6775 accumulation groups +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i23 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1005068982 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 3743 bytes +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2805866 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 587 bytes +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:50Z INFO 9596 []: find first defs for local +2025-11-04T21:40:50Z INFO 9596 []: find first defs for global +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: size = 8469 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: found 6947 accumulation groups +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i11 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: 2172 remat count +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:50Z INFO 9596 []: find first defs for local +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Num intervals 7221 Num locations 7221 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: edge: 103119 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: mean: 28.5609 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: median: 18.0935 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:50Z INFO 9596 []: find first defs for global +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: safe = 7197 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: unsafe = 20 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: total = 7219 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 7221 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Total: 7219 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (7219) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Rover zone: 0.927 (6692) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.071 (515) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.002 (12) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.125 (904) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (1) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.758 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.000 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.000 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.875 (6314) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.650 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 999969358 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3736 bytes +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1773568 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 448 bytes +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.432 seconds +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1028mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: 2181 remat count +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Num intervals 8469 Num locations 8469 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: edge: 130691 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: mean: 30.8634 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: median: 19.9486 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: safe = 8417 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: unsafe = 48 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: total = 8467 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 8469 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.119 seconds +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1031mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15133 memory location(s), 1 block(s), and 62114 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=15133 blocks=1 instructions=62114 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Total: 8467 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (8467) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Rover zone: 0.897 (7593) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.070 (591) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.033 (279) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (4) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.117 (990) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.078 (660) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.365 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.451 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.931 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.805 (6817) +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.582 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.722 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:50Z INFO 9596 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1001742926, 97.6386% input load, 0% output write, 2.36143% spill/reload [sg0000] +2025-11-04T21:40:50Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(9.78087e+08) +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1005068982 +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 3743 bytes +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2805866 +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 587 bytes +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:51Z USER 9596 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.940 seconds +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1033mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:51Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 4096, 0.0173153% out of total spill/reload dma traffic +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:51Z USER 9596 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.153 seconds +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1037mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16561 memory location(s), 1 block(s), and 64820 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:51Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=16561 blocks=1 instructions=64820 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1007874848, 97.3004% input load, 3.175e-06% output write, 2.69963% spill/reload [sg0000] +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:51Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.00650239% out of total dma traffic(9.80666e+08) +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 4 SpillSaves and Reloads +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3739 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 471 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 3739 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 471 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 999967310 +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3739 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1771520 +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 471 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 4096, 0.000408887% out of total dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1001738830, 97.639% input load, 0% output write, 2.36103% spill/reload [sg0000] +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 999967310 +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3739 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1771520 +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 471 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16416 +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 7 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3662 bytes +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:52Z USER 9596 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.322 seconds +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1042mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62110 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:52Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=15126 blocks=1 instructions=62110 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 168 Sb address +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 834 Sb address +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 4 SpillSaves and Reloads +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3747 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 604 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 3747 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 604 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1005003446 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 3747 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2805866 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 604 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 65536, 0.00650239% out of total dma traffic +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1007809312, 97.3002% input load, 3.1752e-06% output write, 2.6998% spill/reload [sg0000] +2025-11-04T21:40:52Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 271 Sb address +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1005003446 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 3747 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2805866 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 604 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 482400 +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 85 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3619 bytes +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:52Z USER 9596 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.332 seconds +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1044mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64817 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:52Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:52Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=16555 blocks=1 instructions=64817 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:40:53Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 188 Sb address +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1470 Sb address +2025-11-04T21:40:53Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 986 Sb address +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.095 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1045mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62110 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=15126 blocks=1 instructions=62110 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: spill space = 67584 bytes +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 69632 bytes +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: size = 2 +2025-11-04T21:40:53Z INFO 9596 []: find first defs for local +2025-11-04T21:40:53Z INFO 9596 []: find first defs for global +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: Num intervals 2 Num locations 2 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: lo = 2 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: total = 2 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 69632 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.236 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1046mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62110 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=15126 blocks=1 instructions=62110 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 69632 +2025-11-04T21:40:53Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 355 Sb address +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 69632 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.117 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62110 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=15126 blocks=1 instructions=62110 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 1237 out of 6929 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.019 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62110 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=15126 blocks=1 instructions=62110 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.039 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 62168, number of allocs: 15126 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.014365 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.009 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.008 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.008 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.063 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.013 seconds +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:53Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,29224>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,31248>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,29200>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<32,16552>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:53Z WARNING 9596 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<96,17672>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.155 seconds +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.013 seconds +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1047mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [build_flow_deps]: Allocs: 15126 instructions: 62168 +2025-11-04T21:40:54Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 33 Sb address +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 159174 edges +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [build_flow_deps]: Done build fdeps 159174 Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.252 seconds +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1056mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.043 seconds +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1056mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:54Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1590 Sb address +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.628 seconds +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1149mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:54Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:54Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.159 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1149mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64817 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=16555 blocks=1 instructions=64817 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: spill space = 102464 bytes +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 114688 bytes +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: size = 9 +2025-11-04T21:40:55Z INFO 9596 []: find first defs for local +2025-11-04T21:40:55Z INFO 9596 []: find first defs for global +2025-11-04T21:40:55Z USER 9596 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.228 seconds +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:55Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=15126 blocks=1 instructions=62168 Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:55Z USER 9596 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.100 seconds +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15126 memory location(s), 1 block(s), and 62168 instruction(s). Max writers: 298 Max Readers: 10261 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Num intervals 9 Num locations 9 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: lo = 9 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: total = 9 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 77824 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.287 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64817 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=16555 blocks=1 instructions=64817 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 75776 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 75776 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.131 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64817 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=16555 blocks=1 instructions=64817 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 1349 out of 7174 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.021 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64817 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=16555 blocks=1 instructions=64817 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.042 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 64875, number of allocs: 16555 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.007678 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 3e-06 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.009 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.009 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.008 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.065 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.013 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.149 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.012 seconds +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1150mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9596 (nc00/sg00) [build_flow_deps]: Allocs: 16555 instructions: 64875 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 176659 edges +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [build_flow_deps]: Done build fdeps 176659 Tue Nov 4 21:40:56 2025 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.214 seconds +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1163mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.048 seconds +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1163mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.543 seconds +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.151 seconds +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=16555 blocks=1 instructions=64875 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.068 seconds +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16555 memory location(s), 1 block(s), and 64875 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:56Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 8.899 seconds +2025-11-04T21:40:56Z INFO 9596 [BackendPassManager]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:56Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=31681 blocks=2 instructions=127043 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=31681 blocks=2 instructions=127043 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.009 seconds +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 31681 memory location(s), 2 block(s), and 127043 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=31681 blocks=2 instructions=127043 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.067 seconds +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 127957 instruction(s). Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:56Z USER 9596 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:40:56Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=32079 blocks=2 instructions=127957 Max writers: 298 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.219 seconds +2025-11-04T21:40:57Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 127961 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.314 seconds +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: curr_vmrss: 1206mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: reserved space = 268616 bytes +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: reserved space = 233728 bytes +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: size = 132 +2025-11-04T21:40:57Z INFO 9596 []: find first defs for local +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.103 seconds +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 []: find first defs for global +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Num intervals 132 Num locations 132 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: lo = 132 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: total = 132 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 77824 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 77824 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3846144 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3846144 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6299648 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.262 seconds +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.270 seconds +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:40:57Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.005 seconds +2025-11-04T21:40:57Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 127961 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.011 seconds +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.066 seconds +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.068 seconds +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.073 seconds +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: curr_vmrss: 1211mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:40:57Z USER 9596 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:40:57Z INFO 9596 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z INFO 9596 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z USER 9596 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.280 seconds +2025-11-04T21:40:57Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1213mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z USER 9596 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.293 seconds +2025-11-04T21:40:57Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1213mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: nc_parallel_pass finished after 0.304 seconds +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: curr_vmrss: 1213mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:57Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127961 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.001 seconds +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1213mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.005 seconds +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1213mb, ru_maxrss: 1406mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:40:57Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:40:57Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:40:57Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:40:57Z INFO 9596 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:40:57Z INFO 9596 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:40:57Z INFO 9596 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:40:57Z INFO 9596 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:40:57Z INFO 9596 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:40:57 2025 +2025-11-04T21:40:57Z INFO 9596 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:40:57 2025 +2025-11-04T21:40:58Z INFO 9596 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:40:58Z INFO 9596 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:00Z INFO 9596 [post_scheduler]: Time-aware simulation time: 5632118 +2025-11-04T21:41:00Z INFO 9596 [post_scheduler]: Time-aware simulation time: 6030695 +2025-11-04T21:41:01Z INFO 9596 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:01 2025 +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: post_sched finished after 3.145 seconds +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1426mb, ru_maxrss: 1426mb (delta=20mb) +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.012 seconds +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1298mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62627 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=15325 blocks=1 instructions=62627 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z INFO 9596 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:01 2025 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: post_sched finished after 3.184 seconds +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1298mb, ru_maxrss: 1426mb (delta=20mb) +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.012 seconds +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1282mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.082 seconds +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1283mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.085 seconds +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1283mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:01Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 3.302 seconds +2025-11-04T21:41:01Z INFO 9596 [BackendPassManager]: curr_vmrss: 1283mb, ru_maxrss: 1426mb (delta=20mb) +2025-11-04T21:41:01Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:01Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127929 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:01Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=32079 blocks=2 instructions=127929 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.007 seconds +2025-11-04T21:41:01Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1283mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 127929 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:01Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.013 seconds +2025-11-04T21:41:01Z INFO 9596 [BackendPassManager]: curr_vmrss: 1283mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:01Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:01Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127929 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:01Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:01Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:01Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4112 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4095 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 307 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 293 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 3017 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 13 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 3176 PSUM Banks +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 46 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 43 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 106 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 57 Sb address +2025-11-04T21:41:02Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 60 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 156 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 997 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: moved 24 MM forward +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1201 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: moved 18 MM forward +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:41:03Z USER 9596 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.486 seconds +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1289mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:03Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:03Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:03Z USER 9596 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.753 seconds +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1328mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:03Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:03Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.551 seconds +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1394mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.089 seconds +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1333mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [build_flow_deps]: Allocs: 15325 instructions: 62595 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 163226 edges +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [build_flow_deps]: Done build fdeps 163226 Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.638 seconds +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1356mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.066 seconds +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1262mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.326 seconds +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1262mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬───────────────────────────┬───────┬───────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼───────────────────────────┼───────┼───────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 177 │ 131072 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 2093 │ 977922112 │ +│ Load │ Internal │ 67 │ 1932294 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 335 │ 1771520 │ +└──────────────┴───────────────────────────┴───────┴───────────┘ + +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 66 │ +│ 8 │ 3 │ +│ 16 │ 6 │ +│ 32 │ 58 │ +│ 64 │ 3 │ +│ 88 │ 3 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 1200 │ +│ 1024 │ 3 │ +│ 2048 │ 85 │ +│ 4096 │ 297 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 560 │ +│ 16400 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [build_flow_deps]: Allocs: 16754 instructions: 65334 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 48447 #MatMult-Transposes 10302 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5789976768 +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60747_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_60851_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:04Z USER 9596 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.026 seconds +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62595 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 181266 edges +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [build_flow_deps]: Done build fdeps 181266 Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.304 seconds +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 208 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 10 │ 2678024 │ +│ Load │ ExternalInput -> Internal │ 2094 │ 977922144 │ +│ Load │ Internal │ 80 │ 4455494 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 381 │ 2805834 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:04Z INFO 9596 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 74 │ +│ 8 │ 4 │ +│ 16 │ 6 │ +│ 32 │ 61 │ +│ 64 │ 7 │ +│ 88 │ 3 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 1201 │ +│ 1024 │ 18 │ +│ 2048 │ 85 │ +│ 4096 │ 325 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 560 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 49291 #MatMult-Transposes 11086 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5789976768 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60747_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t80228 │ Internal │ float32 │ 2562048 │ +│ -t80222 │ Internal │ float32 │ 2562048 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ -t80225 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ all_gather.1_nostride_60851_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.029 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65334 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 3.820 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=32079 blocks=2 instructions=127929 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 452 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 405 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:41:05Z INFO 9596 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: assign_trigger_engine finished after 0.085 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 127929 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=127929 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=15325 blocks=1 instructions=62595 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=16754 blocks=1 instructions=65334 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.021 seconds +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.022 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.028 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=32079 blocks=2 instructions=128047 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: assign_hwdge_engine finished after 0.022 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 128047 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=128047 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 19 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 260 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 248 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 117 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 31 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2324 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 7 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.012 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 7 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 226 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 239 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 87 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 24 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2099 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 5 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.012 seconds +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.019 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.019 seconds +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.039 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=128047 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:05Z USER 9596 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:05Z INFO 9596 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.002 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: nc_parallel_pass finished after 0.007 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=128047 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:05Z USER 9596 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.112 seconds +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z USER 9596 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.122 seconds +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.129 seconds +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: curr_vmrss: 1265mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:05Z USER 9596 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:05Z INFO 9596 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=128047 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z USER 9596 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:05Z USER 9596 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:05Z INFO 9596 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:05Z INFO 9596 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 63033 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 52307 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 67644 +2025-11-04T21:41:05Z INFO 9596 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 67644 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 55881 +2025-11-04T21:41:05Z INFO 9596 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 55881 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [DepReduction]: Finished dependency reduction: 421525 removed, new total 20630 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: dep_reduction finished after 0.898 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.014 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1356mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.038 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1358mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.009 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1359mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65393 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=16754 blocks=1 instructions=65393 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1976/1976 (100% DGE) + power-of-2 partition : 1976/1983 (99.647% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1977/1984 (99.6472% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 38/38 (100% DGE) + power-of-2 partition : 38/483 (7.8675% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 38/483 (7.8675% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 197 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_dma finished after 0.055 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1360mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65395 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=16754 blocks=1 instructions=65395 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: expand_all_engine finished after 0.013 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1359mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65395 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=16754 blocks=1 instructions=65395 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [DepReduction]: Finished dependency reduction: 390093 removed, new total 18737 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: dep_reduction finished after 1.093 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.015 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.094 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65395 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=16754 blocks=1 instructions=65395 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.042 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.011 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62654 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=15325 blocks=1 instructions=62654 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 1976/1976 (100% DGE) + power-of-2 partition : 1976/1981 (99.7476% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 1977/1982 (99.7477% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 29/29 (100% DGE) + power-of-2 partition : 29/419 (6.92124% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 29/419 (6.92124% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 9/9 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: lower_dma finished after 0.066 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62655 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=15325 blocks=1 instructions=62655 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: expand_inst_late finished after 0.112 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65629 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=16754 blocks=1 instructions=65629 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: expand_all_engine finished after 0.020 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62655 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=15325 blocks=1 instructions=62655 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.012 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 65399 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=16754 blocks=1 instructions=65399 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_sync finished after 0.032 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69037 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=16754 blocks=1 instructions=69037 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_act finished after 0.013 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1361mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=16754 blocks=1 instructions=69179 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.113 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1366mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62655 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=15325 blocks=1 instructions=62655 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: expand_inst_late finished after 0.127 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1390mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62664 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=15325 blocks=1 instructions=62664 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_dve finished after 0.195 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1385mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=16754 blocks=1 instructions=69179 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.011 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1364mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 62657 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=15325 blocks=1 instructions=62657 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: lower_ap finished after 0.016 seconds +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1364mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z USER 9596 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:06Z INFO 9596 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=16754 blocks=1 instructions=69179 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: lower_sync finished after 0.039 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1365mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65686 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=15325 blocks=1 instructions=65686 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: lower_act finished after 0.015 seconds +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1365mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z USER 9596 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:06Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=15325 blocks=1 instructions=65827 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:06Z INFO 9596 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:41:06Z INFO 9596 []: find first defs for local reg +2025-11-04T21:41:06Z INFO 9596 []: find first defs for global reg +2025-11-04T21:41:06Z INFO 9596 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:07Z USER 9596 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.207 seconds +2025-11-04T21:41:07Z INFO 9596 (nc00) [CoreForkPass]: curr_vmrss: 1396mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (nc01) [CoreForkPass]: lower_dve finished after 0.271 seconds +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1385mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=15325 blocks=1 instructions=65827 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 (nc01) [CoreForkPass]: lower_ap finished after 0.023 seconds +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1344mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=15325 blocks=1 instructions=65827 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: size = 2 +2025-11-04T21:41:07Z INFO 9596 []: find first defs for local reg +2025-11-04T21:41:07Z INFO 9596 []: find first defs for global reg +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: lo = 2 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: total = 2 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:07Z USER 9596 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.272 seconds +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: curr_vmrss: 1362mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: nc_parallel_pass finished after 2.231 seconds +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: curr_vmrss: 1344mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: vnc_remote_addr_map finished after 0.009 seconds +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: curr_vmrss: 1323mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 135006 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: Running vnc_link +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z INFO 9596 [VncLink]: Found 0 remote updates +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: vnc_link finished after 0.003 seconds +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: curr_vmrss: 1323mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 135006 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=16754 blocks=1 instructions=69179 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=15325 blocks=1 instructions=65827 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.176 seconds +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1329mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z USER 9596 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.240 seconds +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1322mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 0.252 seconds +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: curr_vmrss: 1322mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:41:07Z INFO 9596 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.028 seconds +2025-11-04T21:41:07Z INFO 9596 (sg00) [SubgraphForkPass]: curr_vmrss: 1323mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9596 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 135006 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: subgraph_parallel_pass finished after 0.034 seconds +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: curr_vmrss: 1322mb, ru_maxrss: 1426mb (delta=0mb) +2025-11-04T21:41:07Z USER 9596 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:07Z INFO 9596 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z USER 9596 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:07Z USER 9596 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=15325 blocks=1 instructions=65827 Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=16754 blocks=1 instructions=69179 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:07Z INFO 9596 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64234 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:07Z INFO 9596 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64234 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 49584 │ +│ LDWEIGHTS │ 49584 │ +│ CAST │ 5358 │ +│ EVENT_SEMAPHORE │ 3029 │ +│ UNKNOWN(0xd4) │ 2128 │ +│ ACTIVATE │ 2064 │ +│ COPY │ 1400 │ +│ TENSOR_TENSOR │ 1273 │ +│ UNKNOWN(0xd8) │ 589 │ +│ PSEUDO_DMA_TRIGGER │ 565 │ +│ TENSOR_SCALAR │ 259 │ +│ MEMSET │ 229 │ +│ ACT_TABLE_LOAD │ 141 │ +│ TENSOR_SCALAR_ADDR │ 113 │ +│ UNKNOWN(0xda) │ 68 │ +│ UNKNOWN(0xd9) │ 59 │ +│ UNKNOWN(0xe8) │ 58 │ +│ RECIPROCAL │ 57 │ +│ TENSOR_REDUCE │ 30 │ +│ STREAM_SHUFFLE │ 24 │ +│ LOAD_MASK_SELECT │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 2 │ +│ IOTA │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 4360 │ +│ Scalar │ 8338 │ +│ Tensor │ 100534 │ +│ SyncDMA │ 0 │ +│ Vector │ 3370 │ +│ Sync │ 47 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:08Z USER 9596 (nc01/sg00) [Codegen]: isa_gen finished after 0.654 seconds +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 3824 │ +│ qDVESpillReload0 │ 1392 │ +│ qPoolSpillReload0 │ 39513 │ +│ qSPIO0 │ 33 │ +│ qSPSpillReload0 │ 68 │ +└───────────────────┴────────────────┘ + +Total descriptors: 44830 (0.000668019 GB) +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ all_gather.2 │ Internal │ float32 │ 1 │ +│ t42947_45187_i0_remote_0 │ Internal │ bfloat16 │ 1 │ +│ dot.46-buffer-79928 │ Internal │ bfloat16 │ 1 │ +│ 38660.56441_i369 │ Internal │ float32 │ 1 │ +│ 38660.56441_i537 │ Internal │ float32 │ 1 │ +│ _dot.6985-t47928_i1 │ Internal │ bfloat16 │ 1 │ +│ _dot.2513-t47811_i1 │ Internal │ bfloat16 │ 1 │ +│ input0 │ ExternalInput │ int32 │ 1 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└──────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:08Z USER 9596 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.010 seconds +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:08Z WARNING 9596 (nc01/sg00) [Codegen]: Found 251 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:08Z USER 9596 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.128 seconds +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 50488 │ +│ LDWEIGHTS │ 50488 │ +│ CAST │ 5358 │ +│ EVENT_SEMAPHORE │ 3638 │ +│ UNKNOWN(0xd4) │ 2362 │ +│ ACTIVATE │ 2071 │ +│ COPY │ 1634 │ +│ TENSOR_TENSOR │ 1274 │ +│ TENSOR_SCALAR_ADDR │ 674 │ +│ PSEUDO_DMA_TRIGGER │ 652 │ +│ UNKNOWN(0xd8) │ 589 │ +│ IOTA │ 394 │ +│ UNKNOWN(0xda) │ 293 │ +│ TENSOR_SCALAR │ 261 │ +│ MEMSET │ 241 │ +│ GATHER │ 240 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ ACT_TABLE_LOAD │ 142 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ UNKNOWN(0xe8) │ 58 │ +│ TENSOR_REDUCE │ 35 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 6786 │ +│ Scalar │ 8607 │ +│ Tensor │ 102336 │ +│ SyncDMA │ 0 │ +│ Vector │ 4024 │ +│ Sync │ 78 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:08Z USER 9596 (nc01/sg00) [ModuleForkPass]: codegen finished after 0.812 seconds +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1432mb, ru_maxrss: 1432mb (delta=6mb) +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 15325 memory location(s), 1 block(s), and 65827 instruction(s). Max writers: 299 Max Readers: 10261 +2025-11-04T21:41:08Z USER 9596 (nc00/sg00) [Codegen]: isa_gen finished after 0.829 seconds +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4052 │ +│ qDVESpillReload0 │ 1844 │ +│ qPoolSpillReload0 │ 50340 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 330 │ +└───────────────────┴────────────────┘ + +Total descriptors: 56617 (0.000843659 GB) +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────┼───────────────┼──────────┼──────────────────┤ +│ all_reduce.112 │ Internal │ bfloat16 │ 2 │ +│ rng.1 │ Internal │ float32 │ 2 │ +│ custom_call.143 │ Internal │ float32 │ 2 │ +│ get_tuple_element.3 │ Internal │ float32 │ 2 │ +│ get_tuple_element.5 │ Internal │ float32 │ 2 │ +│ get_tuple_element.2 │ Internal │ uint32 │ 2 │ +│ custom_call.142 │ Internal │ float32 │ 2 │ +│ get_tuple_element.1 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:08Z USER 9596 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.016 seconds +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:08Z WARNING 9596 (nc00/sg00) [Codegen]: Found 224 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:08Z USER 9596 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.177 seconds +2025-11-04T21:41:08Z USER 9596 (nc00/sg00) [ModuleForkPass]: codegen finished after 1.058 seconds +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1404mb, ru_maxrss: 1432mb (delta=6mb) +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 16754 memory location(s), 1 block(s), and 69179 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:08Z USER 9596 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9596 [BackendPassManager]: mod_parallel_pass finished after 1.077 seconds +2025-11-04T21:41:08Z INFO 9596 [BackendPassManager]: curr_vmrss: 1327mb, ru_maxrss: 1432mb (delta=6mb) +2025-11-04T21:41:08Z USER 9596 [BackendPassManager]: Running hbm_usage +2025-11-04T21:41:08Z INFO 9596 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 126.156KB │ +│ CCE │ 0.000B │ 674.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 161.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.659GB │ +│ Model Code │ 7.436MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.008MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 961.828KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 416.000B │ 110.594KB │ +│ CCE │ 0.000B │ 506.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.500KB │ 139.750KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:08Z INFO 9596 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.658GB │ +│ Model Code │ 7.120MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.008MB │ +│ DMA Ring IO │ 1.906KB │ +│ DMA Ring Spill │ 757.016KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:08Z INFO 9596 [HBMUsage]: Total estimated HBM usage is: 3.672GB +2025-11-04T21:41:08Z USER 9596 [BackendPassManager]: hbm_usage finished after 0.010 seconds +2025-11-04T21:41:08Z INFO 9596 [BackendPassManager]: curr_vmrss: 1313mb, ru_maxrss: 1432mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 135006 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:08Z USER 9596 [BackendPassManager]: Running neff_packager +2025-11-04T21:41:08Z INFO 9596 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=32079 blocks=2 instructions=135006 Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:09Z WARNING 9596 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh/metrics.json +2025-11-04T21:41:09Z WARNING 9596 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:41:09Z INFO 9596 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff +2025-11-04T21:41:09Z INFO 9596 [NeffFileWriter]: IR signature: aa21a98bd014ba8ad788e4e8586a1222 for neff artifacts +2025-11-04T21:41:09Z USER 9596 [BackendPassManager]: neff_packager finished after 0.401 seconds +2025-11-04T21:41:09Z INFO 9596 [BackendPassManager]: curr_vmrss: 1314mb, ru_maxrss: 1432mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9596 [BackendPassManager]: Output has 2 module(s), 2 function(s), 32079 memory location(s), 2 block(s), and 135006 instruction(s). Max writers: 299 Max Readers: 11045 +2025-11-04T21:41:09Z INFO 9596 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005867 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000107 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005882 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005867 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000065 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000065 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000065 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005867 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:41:09Z INFO 9596 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_4 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_2 │ int32 │ 1 │ 0.003906 MB │ +│ split_3 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:09Z INFO 9596 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:09Z INFO 9596 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_3 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:09Z INFO 9596 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:41:09Z INFO 8867 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:41:09Z INFO 8867 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:41:09Z INFO 8867 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh +2025-11-04T21:41:09Z INFO 8867 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:41:09Z INFO 8867 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:41:09Z INFO 8867 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:41:09Z INFO 8867 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:41:09Z INFO 8867 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:41:09Z INFO 8867 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:41:09Z INFO 8867 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh/hlo_netlist.json +2025-11-04T21:41:10Z INFO 8867 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk2/neuronxcc-a95le4bh/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:41:10Z INFO 8867 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:41:10Z INFO 8867 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:41:10Z INFO 8867 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:41:10Z INFO 8867 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:41:10Z INFO 8794 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk2/metaneff.pb b/token_generation_model/_tp0_bk2/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..bd332fd606a6e33ec5c19edbb11115c32e65401a --- /dev/null +++ b/token_generation_model/_tp0_bk2/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727e91b123a070acfa5efd011cab6f15a84999e67c840912471bb7fb89db4363 +size 3988817 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb b/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..17f7968e780ad6a7398d6f85dc79214841fa8e4c --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6be812b5bde1f556477287f12ff66ec425835fb5ee9b9d91dfb1e4bdb168c00 +size 4075105 diff --git a/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff b/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff new file mode 100644 index 0000000000000000000000000000000000000000..4944196ff04153b916e82be2739fc0e097fb718a --- /dev/null +++ b/token_generation_model/_tp0_bk2/model.MODULE_bac42b9b464c64624582+1ea12800.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0074929c6140282979070bdfc412360950dfa5e2dbe995e0afa9cc7a6661e809 +size 6657024 diff --git a/token_generation_model/_tp0_bk2/neuron_config.json b/token_generation_model/_tp0_bk2/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec6714b90b7f96ff8c716ba7472b5eceb74d6e78 --- /dev/null +++ b/token_generation_model/_tp0_bk2/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 512 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 512 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk3/command.txt b/token_generation_model/_tp0_bk3/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..03692c74cf04da4117c4460f218394c54addec98 --- /dev/null +++ b/token_generation_model/_tp0_bk3/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb --output model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/compile_flags.MODULE_8aa2bc135acfce1f4a61+bd0ab490.json b/token_generation_model/_tp0_bk3/compile_flags.MODULE_8aa2bc135acfce1f4a61+bd0ab490.json new file mode 100644 index 0000000000000000000000000000000000000000..39f8f388e9a847b53bfc3f9b098bf7f70e192733 --- /dev/null +++ b/token_generation_model/_tp0_bk3/compile_flags.MODULE_8aa2bc135acfce1f4a61+bd0ab490.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/global_metric_store.json b/token_generation_model/_tp0_bk3/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..eb9063142333749c22a277726a85ae24cb331005 --- /dev/null +++ b/token_generation_model/_tp0_bk3/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.25508117675781, + "StaticProfiler::AveragePartitionUtilization": 89.86397552490234, + "StaticProfiler::AveragePeUtilization": 76.03650665283203, + "StaticProfiler::LocalizationEfficiency": 256.87542724609375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 263.38134765625, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.613478660583496, + "AffinePredicateResolution": 0.08293986320495605, + "AliasDependencyElimination": 0.003684520721435547, + "AliasDependencyInduction": 0.689274787902832, + "AliasDependencyReset": 0.7229092121124268, + "BFComputeCutting": 0.06791234016418457, + "BirCodeGenLoop": 1.860677719116211, + "CCOpFusion": 0.693291187286377, + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeDAGForPGTiling": 0.17862343788146973, + "CanonicalizeForTensorizer": 0.00043899999582208693, + "CanonicalizeIR": 0.14400553703308105, + "Canonicalizer": 0.014510000124573708, + "CoalesceCCOp": 0.19598054885864258, + "CommuteConcat": 0.031813621520996094, + "DMALocalityOpt": 0.043062448501586914, + "DMAProfiler": 0.08422327041625977, + "DMATilingProfiler": 0.12095189094543457, + "DataLocalityOpt": 3.2372589111328125, + "DataStreaming": 0.16935110092163086, + "DeConcat": 0.06088733673095703, + "DeadCodeElimination": 0.033219099044799805, + "DeadStoreElimination": 1.185746669769287, + "DelinearIndices": 0.43347668647766113, + "Delinearization": 0.16624140739440918, + "DelinearizeSPMD": 0.21023201942443848, + "DoNothing": 0.0003592967987060547, + "DramToDramTranspose": 0.3406989574432373, + "DumpGraphAndMetadata": 0.13997292518615723, + "EliminateDivs": 0.20760846138000488, + "ExpandBatchNorm": 0.09476161003112793, + "ExpandISAMacro": 0.09129691123962402, + "FactorizeBlkDims": 0.5268166065216064, + "FactorizeThreadAxesInFreeDims": 0.08941841125488281, + "FlattenMacroLoop": 0.10271286964416504, + "GenericAccessSimplifier": 0.028722524642944336, + "HoistCompute": 9.000000136438757e-05, + "IdentifyCrossPassTensors": 0.0001990000018849969, + "InferInitValue": 1.3908584117889404, + "InferIntrinsicOnCC": 0.3405766487121582, + "InferNeuronTensor": 1.702704668045044, + "InferNonlocalTensors": 5.50789213180542, + "InferPSumTensor": 1.3714420795440674, + "InferShardAxis": 9.253787994384766, + "InferSharedMemLoc": 0.11624884605407715, + "InlineNativeKernels": 0.0558774471282959, + "InsertCoreBarrier": 0.13271498680114746, + "InsertIOTransposes": 0.8883798122406006, + "InsertImplicitShardAxisBeforeISel": 0.41409993171691895, + "InsertLocalTransposes": 0.7429473400115967, + "InsertOffloadedTransposes": 0.13221502304077148, + "LICM": 0.12227439880371094, + "LateLegalizeInst": 0.15507245063781738, + "LateLegalizePostSplit": 0.10195636749267578, + "LateLowerReshapeOp": 0.03727078437805176, + "LateLowerTensorOp": 0.39714479446411133, + "LateNeuronInstComb": 1.150972604751587, + "LayoutPreprocessing": 0.9332830905914307, + "LayoutPreprocessingAndAnalysis": 1.391254186630249, + "LayoutRequirementAnalysis": 0.44931793212890625, + "LegalizeCCOpLayout": 0.13326668739318848, + "LegalizeOpLevelAlias": 0.046427011489868164, + "LegalizePartitionReduce": 0.08784890174865723, + "LegalizeSundaAccess": 0.9785583019256592, + "LegalizeSundaMacro": 0.6998870372772217, + "LegalizeType": 0.15176653861999512, + "LocalLayoutOpt": 0.6880152225494385, + "LoopFusion": 0.38782477378845215, + "LoopSplitting": 0.029788732528686523, + "LowerBroadcast": 0.1136465072631836, + "LowerCCOpBlockAxis": 0.22148585319519043, + "LowerComplexBroadcast": 0.07951140403747559, + "LowerIntrinsics": 1.3014023303985596, + "LowerShardAxis": 0.22774982452392578, + "LowerTensorOp": 0.6523504257202148, + "LowerToSendRecv": 0.16421246528625488, + "LowerTranspose": 0.49215221405029297, + "MacroGeneration": 2.668619394302368, + "MaskPropagation": 0.11704516410827637, + "MemcastMotion": 0.00014400000509340316, + "MemcpyElimination": 9.137748718261719, + "MutateDataType": 0.0424351692199707, + "NeuronAliasDependencyInduction": 0.019921302795410156, + "NeuronAliasDependencyReset": 0.035854339599609375, + "NeuronInstComb": 0.40268683433532715, + "NeuronLICM": 0.27900099754333496, + "NeuronLoopFusion": 1.5907442569732666, + "NeuronLoopInterchange": 0.06258201599121094, + "NeuronSimplifier": 0.507253885269165, + "NeuronSimplifyPredicates": 0.28333568572998047, + "NeuronValueNumbering": 0.1284947395324707, + "OptimizeAliasedCopyChain": 0.024678707122802734, + "OptimizeNKIKernels": 1.6666042804718018, + "PAGLayoutOpt": 15.41032886505127, + "PComputeCutting": 0.3097984790802002, + "PGLayoutTilingPipeline": 40.68231964111328, + "PGTiling": 5.982719421386719, + "PadElimination": 0.018638134002685547, + "ParAxesAnnotation": 14.657413482666016, + "PartialLoopFusion": 1.228170394897461, + "PartialSimdFusion": 0.8126676082611084, + "PenguinizeFunctions": 0.00020300000323913991, + "PerfectLoopNest": 0.07282352447509766, + "PruneFunctions": 0.00043799998820759356, + "RecognizeOpIdiom": 0.12730193138122559, + "Recompute": 0.009931087493896484, + "RelaxPredicates": 0.11692571640014648, + "Rematerialization": 0.1635129451751709, + "RemoveOptimizationBarriers": 0.0004330000083427876, + "RemoveShardedPartitionAxes": 1.2620294094085693, + "ReshapeWeights": 0.03252673149108887, + "ResolveAccessConflict": 0.19570350646972656, + "ResolveComplicatePredicates": 0.0764627456665039, + "RewriteReplicationMatmul": 0.046774864196777344, + "RewriteWeights": 0.0912783145904541, + "SFKVectorizer": 6.535493850708008, + "ScatterMotion": 0.003582000033929944, + "ShardingPropagationAnalysis": 0.7250044345855713, + "SimpleAllReduceTiling": 0.07016158103942871, + "Simplifier": 0.10029053688049316, + "SimplifyMacroPredicates": 0.2846403121948242, + "SimplifyNeuronTensor": 0.39885902404785156, + "SimplifySlice": 0.032215118408203125, + "SimplifyTensor": 0.2902982234954834, + "SpillPSum": 0.679192304611206, + "SplitAPUnionSets": 0.5018737316131592, + "SplitAccGrp": 0.04875993728637695, + "StaticProfiler": 0.1320803165435791, + "StaticTransposeLocalTensor": 0.2515878677368164, + "SundaISel": 1.4164702892303467, + "TCTransform": 0.03308296203613281, + "TensorInitialization": 0.18078994750976563, + "TensorOpSimplifier": 0.7264657020568848, + "TensorOpTransform": 2.3855443000793457, + "TensorizerLegalizationPass": 0.00020799999765586108, + "TileCCOps": 0.2816429138183594, + "TilingProfiler": 0.4777710437774658, + "TransformConvOp": 0.13524365425109863, + "TritiumFusion": 1.4637632369995117, + "ValueNumbering": 0.09920430183410645, + "VectorizeDMA": 0.6684191226959229, + "VectorizeMatMult": 0.07972240447998047, + "VerifySupportedOps": 0.00046999999904073775, + "WeightCoalescing": 0.06412029266357422, + "ZeroSizeTensorElimination": 0.00037407875061035156, + "algsimp": 0.002443999983370304, + "batchnorm_expander": 0.0010989999864250422, + "boundary-marker-removal": 0.00046499999007210135, + "call-inliner": 0.0004149999876972288, + "canonicalize-boundary-marker": 0.0006760000251233578, + "collective-stream-id-checker": 9.600000339560211e-05, + "comparison-expander": 0.0006399999838322401, + "computation-deduplicator": 0.0005729999975301325, + "config-lowering": 0.00032900000223889947, + "constant_folding": 0.00021300000662449747, + "cse": 0.0009490000084042549, + "dce": 8.399999933317304e-05, + "dynamic-slice-transpose": 0.0003370000049471855, + "eliminate-redundant-compare": 0.00018200000340584666, + "emit-offloaded-dropout": 0.00041199999395757914, + "flatten-call-graph": 0.0004990000161342323, + "fuse-send-recv": 0.002171000000089407, + "hilo-conditional-to-select": 0.00016199999663513154, + "hilo::LegalizeAlias": 0.003091000020503998, + "hilo::NeuronInstCombine": 0.0004180000105407089, + "hilo::NeuronOpFusion": 0.00043799998820759356, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00031400000443682075, + "hilo::ScheduleFusion": 4.8999998398358e-05, + "hilo::SixtyFourHack": 0.00030300000798888505, + "hilo::VerifyAliasing": 0.00012599999899975955, + "hlo-mac-count": 0.00790099985897541, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0015510000521317124, + "legalize-ccops-for-tensorizer": 3.600000127335079e-05, + "legalize-compare": 0.00043799998820759356, + "lower-argminmax-custom-call": 0.0003330000035930425, + "map-inline": 0.0008950000046752393, + "metadata-naming": 0.0016899999463930726, + "mlir::detail::OpToOpPassAdaptor": 0.0001900000061141327, + "mlir::hlo::MhloToPyPenguin": 0.0982000008225441, + "mlir::mhlo::LowerComplexExtraPass": 0.004832000005990267, + "mlir::mhlo::LowerComplexPass": 0.0002789999998640269, + "native-to-custom-softmax": 0.0005840000230818987, + "native-to-custom-softmax-dx": 0.0005959999980404973, + "neuron-hlo-verifier": 0.025195999071002007, + "operand_upcaster": 0.0010349999647587538, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.07145500183105469, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 1.9999999949504854e-06, + "replace-minimum-constant": 0.0003389999910723418, + "reshape-mover": 9.500000305706635e-05, + "simplify-concat": 0.0027129999361932278, + "simplify-while-loops": 0.00010199999815085903, + "transform-variadic-reduce": 0.0007019999902695417, + "tuple-simplifier": 0.00018699999782256782, + "unpack-nested-aws-ntwsr": 0.00046300000394694507, + "unroll-while-loop": 1.4999999621068127e-05 + }, + "hilo": { + "HloMacCount": 7368572928.0, + "Traffic": 3915395840.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 121884, + "StaticProfiler::AifUb": 15.004831314086914, + "StaticProfiler::ArithmeticIntensityTensorizer": 38.543724060058594, + "StaticProfiler::AverageDmaLength": 3266.278076171875, + "StaticProfiler::DDRTransferBytes": 2254072788, + "StaticProfiler::InternalTransferBytes": 578006208, + "StaticProfiler::LoadExpanded": 550182, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 558531, + "StaticProfiler::TotalDynamicInstancesCount": 148770, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 119986, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 82081, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 27301, + "TilingProfiler::PfTransposeInstructionsForIo": 23848, + "TilingProfiler::PfTransposeInstructionsForLocal": 1491, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 285, + "TilingProfiler::SimdInstructionsAfterTiling": 3136, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 9.999999974752427e-07, + "CanonicalizeForTensorizer": 0.00043899999582208693, + "Canonicalizer": 0.014510000124573708, + "HoistCompute": 9.000000136438757e-05, + "IdentifyCrossPassTensors": 0.0001990000018849969, + "MemcastMotion": 0.00014400000509340316, + "PenguinizeFunctions": 0.00020300000323913991, + "PruneFunctions": 0.00043799998820759356, + "RemoveOptimizationBarriers": 0.0004330000083427876, + "ScatterMotion": 0.003582000033929944, + "TensorizerLegalizationPass": 0.00020799999765586108, + "VerifySupportedOps": 0.00046999999904073775, + "algsimp": 0.002443999983370304, + "batchnorm_expander": 0.0010989999864250422, + "boundary-marker-removal": 0.00046499999007210135, + "call-inliner": 0.0004149999876972288, + "canonicalize-boundary-marker": 0.0006760000251233578, + "collective-stream-id-checker": 9.600000339560211e-05, + "comparison-expander": 0.0006399999838322401, + "computation-deduplicator": 0.0005729999975301325, + "config-lowering": 0.00032900000223889947, + "constant_folding": 0.00021300000662449747, + "cse": 0.0009490000084042549, + "dce": 8.399999933317304e-05, + "dynamic-slice-transpose": 0.0003370000049471855, + "eliminate-redundant-compare": 0.00018200000340584666, + "emit-offloaded-dropout": 0.00041199999395757914, + "flatten-call-graph": 0.0004990000161342323, + "fuse-send-recv": 0.002171000000089407, + "hilo-conditional-to-select": 0.00016199999663513154, + "hilo::LegalizeAlias": 0.003091000020503998, + "hilo::NeuronInstCombine": 0.0004180000105407089, + "hilo::NeuronOpFusion": 0.00043799998820759356, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00031400000443682075, + "hilo::ScheduleFusion": 4.8999998398358e-05, + "hilo::SixtyFourHack": 0.00030300000798888505, + "hilo::VerifyAliasing": 0.00012599999899975955, + "hlo-mac-count": 0.00790099985897541, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 9.999999974752427e-07, + "io-layout-normalization": 0.0015510000521317124, + "legalize-ccops-for-tensorizer": 3.600000127335079e-05, + "legalize-compare": 0.00043799998820759356, + "lower-argminmax-custom-call": 0.0003330000035930425, + "map-inline": 0.0008950000046752393, + "metadata-naming": 0.0016899999463930726, + "mlir::detail::OpToOpPassAdaptor": 0.0001900000061141327, + "mlir::hlo::MhloToPyPenguin": 0.0982000008225441, + "mlir::mhlo::LowerComplexExtraPass": 0.004832000005990267, + "mlir::mhlo::LowerComplexPass": 0.0002789999998640269, + "native-to-custom-softmax": 0.0005840000230818987, + "native-to-custom-softmax-dx": 0.0005959999980404973, + "neuron-hlo-verifier": 0.025195999071002007, + "operand_upcaster": 0.0010349999647587538, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.07145500183105469, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 1.9999999949504854e-06, + "replace-minimum-constant": 0.0003389999910723418, + "reshape-mover": 9.500000305706635e-05, + "simplify-concat": 0.0027129999361932278, + "simplify-while-loops": 0.00010199999815085903, + "transform-variadic-reduce": 0.0007019999902695417, + "tuple-simplifier": 0.00018699999782256782, + "unpack-nested-aws-ntwsr": 0.00046300000394694507, + "unroll-while-loop": 1.4999999621068127e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002338886260986328, + "DMALocalityOpt": 0.0002200603485107422, + "DMAProfiler": 0.0007832050323486328, + "DataStreaming": 0.00030303001403808594, + "DoNothing": 0.0001308917999267578, + "ExpandISAMacro": 0.0005805492401123047, + "FactorizeBlkDims": 0.0004668235778808594, + "InferPSumTensor": 0.0005860328674316406, + "InferSharedMemLoc": 0.0002865791320800781, + "InsertCoreBarrier": 0.00026535987854003906, + "LateLegalizeInst": 0.0004177093505859375, + "LateNeuronInstComb": 0.0006740093231201172, + "LegalizeSundaAccess": 0.0015828609466552734, + "LegalizeType": 0.0002980232238769531, + "LowerBroadcast": 0.00029921531677246094, + "LowerIntrinsics": 0.0002605915069580078, + "LowerTranspose": 0.0002872943878173828, + "NeuronInstComb": 0.0007240772247314453, + "NeuronLICM": 0.0004010200500488281, + "NeuronSimplifyPredicates": 0.003510713577270508, + "NeuronValueNumbering": 0.00045490264892578125, + "SFKVectorizer": 0.0027697086334228516, + "SimpleAllReduceTiling": 0.0002129077911376953, + "SimplifyNeuronTensor": 0.0005700588226318359, + "SpillPSum": 0.0005691051483154297, + "WeightCoalescing": 0.00026035308837890625 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 3.763896942138672, + "HloMacCount": 7368572928.0, + "Traffic": 3915395840.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.613478660583496, + "AffinePredicateResolution": 0.08293986320495605, + "AliasDependencyElimination": 0.003684520721435547, + "AliasDependencyInduction": 0.689274787902832, + "AliasDependencyReset": 0.7229092121124268, + "BFComputeCutting": 0.06791234016418457, + "BirCodeGenLoop": 1.860677719116211, + "CCOpFusion": 0.693291187286377, + "CanonicalizeDAGForPGTiling": 0.17862343788146973, + "CanonicalizeIR": 0.14400553703308105, + "CoalesceCCOp": 0.19327330589294434, + "CommuteConcat": 0.031813621520996094, + "DMALocalityOpt": 0.040947675704956055, + "DMAProfiler": 0.0807037353515625, + "DMATilingProfiler": 0.12095189094543457, + "DataLocalityOpt": 3.2372589111328125, + "DataStreaming": 0.1649169921875, + "DeConcat": 0.06088733673095703, + "DeadCodeElimination": 0.033219099044799805, + "DeadStoreElimination": 1.185746669769287, + "DelinearIndices": 0.43347668647766113, + "Delinearization": 0.16624140739440918, + "DelinearizeSPMD": 0.21023201942443848, + "DoNothing": 7.05718994140625e-05, + "DramToDramTranspose": 0.3406989574432373, + "DumpGraphAndMetadata": 0.13997292518615723, + "EliminateDivs": 0.20760846138000488, + "ExpandBatchNorm": 0.09476161003112793, + "ExpandISAMacro": 0.08798837661743164, + "FactorizeBlkDims": 0.519040584564209, + "FactorizeThreadAxesInFreeDims": 0.08941841125488281, + "FlattenMacroLoop": 0.10271286964416504, + "GenericAccessSimplifier": 0.028722524642944336, + "InferInitValue": 1.3908584117889404, + "InferIntrinsicOnCC": 0.3405766487121582, + "InferNeuronTensor": 1.702704668045044, + "InferNonlocalTensors": 5.50789213180542, + "InferPSumTensor": 1.3632442951202393, + "InferShardAxis": 9.253787994384766, + "InferSharedMemLoc": 0.11394119262695313, + "InlineNativeKernels": 0.0558774471282959, + "InsertCoreBarrier": 0.13022232055664063, + "InsertIOTransposes": 0.8883798122406006, + "InsertImplicitShardAxisBeforeISel": 0.41409993171691895, + "InsertLocalTransposes": 0.7429473400115967, + "InsertOffloadedTransposes": 0.13221502304077148, + "LICM": 0.12227439880371094, + "LateLegalizeInst": 0.14958691596984863, + "LateLegalizePostSplit": 0.10195636749267578, + "LateLowerReshapeOp": 0.03727078437805176, + "LateLowerTensorOp": 0.39714479446411133, + "LateNeuronInstComb": 1.1442666053771973, + "LayoutPreprocessing": 0.9332830905914307, + "LayoutPreprocessingAndAnalysis": 1.391254186630249, + "LayoutRequirementAnalysis": 0.44931793212890625, + "LegalizeCCOpLayout": 0.13326668739318848, + "LegalizeOpLevelAlias": 0.046427011489868164, + "LegalizePartitionReduce": 0.08784890174865723, + "LegalizeSundaAccess": 0.9682760238647461, + "LegalizeSundaMacro": 0.6998870372772217, + "LegalizeType": 0.14560770988464355, + "LocalLayoutOpt": 0.6880152225494385, + "LoopFusion": 0.38782477378845215, + "LoopSplitting": 0.029788732528686523, + "LowerBroadcast": 0.11038398742675781, + "LowerCCOpBlockAxis": 0.22148585319519043, + "LowerComplexBroadcast": 0.07951140403747559, + "LowerIntrinsics": 1.2981758117675781, + "LowerShardAxis": 0.22774982452392578, + "LowerTensorOp": 0.6523504257202148, + "LowerToSendRecv": 0.16421246528625488, + "LowerTranspose": 0.48946118354797363, + "MacroGeneration": 2.668619394302368, + "MaskPropagation": 0.11704516410827637, + "MemcpyElimination": 9.137748718261719, + "MutateDataType": 0.0424351692199707, + "NeuronAliasDependencyInduction": 0.019921302795410156, + "NeuronAliasDependencyReset": 0.035854339599609375, + "NeuronInstComb": 0.39587831497192383, + "NeuronLICM": 0.2722969055175781, + "NeuronLoopFusion": 1.5907442569732666, + "NeuronLoopInterchange": 0.06258201599121094, + "NeuronSimplifier": 0.507253885269165, + "NeuronSimplifyPredicates": 0.27733659744262695, + "NeuronValueNumbering": 0.12531447410583496, + "OptimizeAliasedCopyChain": 0.024678707122802734, + "OptimizeNKIKernels": 1.6666042804718018, + "PAGLayoutOpt": 15.41032886505127, + "PComputeCutting": 0.3097984790802002, + "PGLayoutTilingPipeline": 40.68231964111328, + "PGTiling": 5.982719421386719, + "PadElimination": 0.018638134002685547, + "ParAxesAnnotation": 14.657413482666016, + "PartialLoopFusion": 1.228170394897461, + "PartialSimdFusion": 0.8126676082611084, + "PerfectLoopNest": 0.07282352447509766, + "RecognizeOpIdiom": 0.12730193138122559, + "Recompute": 0.009931087493896484, + "RelaxPredicates": 0.11692571640014648, + "Rematerialization": 0.1635129451751709, + "RemoveShardedPartitionAxes": 1.2620294094085693, + "ReshapeWeights": 0.03252673149108887, + "ResolveAccessConflict": 0.19570350646972656, + "ResolveComplicatePredicates": 0.0764627456665039, + "RewriteReplicationMatmul": 0.046774864196777344, + "RewriteWeights": 0.0912783145904541, + "SFKVectorizer": 6.5128865242004395, + "ShardingPropagationAnalysis": 0.7250044345855713, + "SimpleAllReduceTiling": 0.06754159927368164, + "Simplifier": 0.10029053688049316, + "SimplifyMacroPredicates": 0.2846403121948242, + "SimplifyNeuronTensor": 0.350846529006958, + "SimplifySlice": 0.032215118408203125, + "SimplifyTensor": 0.2902982234954834, + "SpillPSum": 0.6651058197021484, + "SplitAPUnionSets": 0.5018737316131592, + "SplitAccGrp": 0.04875993728637695, + "StaticProfiler": 0.1320803165435791, + "StaticTransposeLocalTensor": 0.2515878677368164, + "SundaISel": 1.4164702892303467, + "TCTransform": 0.03308296203613281, + "TensorInitialization": 0.18078994750976563, + "TensorOpSimplifier": 0.7264657020568848, + "TensorOpTransform": 2.3855443000793457, + "TileCCOps": 0.2816429138183594, + "TilingProfiler": 0.4777710437774658, + "TransformConvOp": 0.13524365425109863, + "TritiumFusion": 1.4637632369995117, + "ValueNumbering": 0.09920430183410645, + "VectorizeDMA": 0.6684191226959229, + "VectorizeMatMult": 0.07972240447998047, + "WeightCoalescing": 0.06148171424865723, + "ZeroSizeTensorElimination": 0.00037407875061035156 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 121884, + "StaticProfiler::AifUb": 15.004831314086914, + "StaticProfiler::ArithmeticIntensityTensorizer": 38.543724060058594, + "StaticProfiler::AverageDmaLength": 3266.278076171875, + "StaticProfiler::AverageFractalPeUtilization": 98.25508117675781, + "StaticProfiler::AveragePartitionUtilization": 89.86397552490234, + "StaticProfiler::AveragePeUtilization": 76.03650665283203, + "StaticProfiler::DDRTransferBytes": 2254072788, + "StaticProfiler::InternalTransferBytes": 578006208, + "StaticProfiler::LoadExpanded": 550182, + "StaticProfiler::LocalizationEfficiency": 256.87542724609375, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 263.38134765625, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 558531, + "StaticProfiler::TotalDynamicInstancesCount": 148770, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 119986, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 82081, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 27301, + "TilingProfiler::PfTransposeInstructionsForIo": 23848, + "TilingProfiler::PfTransposeInstructionsForLocal": 1491, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 285, + "TilingProfiler::SimdInstructionsAfterTiling": 3136, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.0024733543395996094, + "DMALocalityOpt": 0.0018947124481201172, + "DMAProfiler": 0.002736330032348633, + "DataStreaming": 0.0041310787200927734, + "DoNothing": 0.00015783309936523438, + "ExpandISAMacro": 0.002727985382080078, + "FactorizeBlkDims": 0.0073091983795166016, + "InferPSumTensor": 0.007611751556396484, + "InferSharedMemLoc": 0.0020210742950439453, + "InsertCoreBarrier": 0.002227306365966797, + "LateLegalizeInst": 0.0050678253173828125, + "LateNeuronInstComb": 0.006031990051269531, + "LegalizeSundaAccess": 0.008699417114257813, + "LegalizeType": 0.005860805511474609, + "LowerBroadcast": 0.0029633045196533203, + "LowerIntrinsics": 0.0029659271240234375, + "LowerTranspose": 0.002403736114501953, + "NeuronInstComb": 0.006084442138671875, + "NeuronLICM": 0.006303071975708008, + "NeuronSimplifyPredicates": 0.002488374710083008, + "NeuronValueNumbering": 0.002725362777709961, + "SFKVectorizer": 0.019837617874145508, + "SimpleAllReduceTiling": 0.002407073974609375, + "SimplifyNeuronTensor": 0.04744243621826172, + "SpillPSum": 0.013517379760742188, + "WeightCoalescing": 0.002378225326538086 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk3/graph.neff b/token_generation_model/_tp0_bk3/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..3362b302960e9ba17a300c5cdea01194e6ec2f0a --- /dev/null +++ b/token_generation_model/_tp0_bk3/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7725ff2a45b5375e7597bb8ce00907e416a0f70c0e8323f6633b51ad821fe52e +size 7465984 diff --git a/token_generation_model/_tp0_bk3/log-neuron-cc.txt b/token_generation_model/_tp0_bk3/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b4f0033186e9aaddb6f6bcccf8dbe4870fe97d5 --- /dev/null +++ b/token_generation_model/_tp0_bk3/log-neuron-cc.txt @@ -0,0 +1,4604 @@ +2025-11-04T21:38:36Z INFO 8806 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:36Z INFO 8806 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:36Z INFO 8854 [root]: XLA detected +2025-11-04T21:38:36Z INFO 8854 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:36Z INFO 8854 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3 +2025-11-04T21:38:36Z INFO 8854 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:36Z INFO 8854 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8854 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:36Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:36Z INFO 8854 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:36Z INFO 8854 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8854 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:37Z INFO 8854 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:38:37.007094: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:38:37.017636: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10701 = tuple(%reshape.4385, %scatter.9929, %scatter.9944, %scatter.9957, %scatter.9972, %scatter.9985, %scatter.10000, %scatter.10013, %scatter.10028, %scatter.10041, %scatter.10056, %scatter.10069, %scatter.10084, %scatter.10097, %scatter.10112, %scatter.10125, %scatter.10140, %scatter.10153, %scatter.10168, %scatter.10181, %scatter.10196, %scatter.10209, %scatter.10224, %scatter.10237, %scatter.10252, %scatter.10265, %scatter.10280, %scatter.10293, %scatter.10308, %scatter.10321, %scatter.10336... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:37Z INFO 8854 [job.HLOToTensorizer.0]: IR signature: 3cd77593d3d64fccdad705fb50470d45a4176222615310965224f7624529e1b8 for sg0000/HLOToTensorizer +2025-11-04T21:38:37Z INFO 8854 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:37Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:37Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:37Z INFO 8854 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:37Z INFO 8854 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:37Z INFO 8854 [job.Frontend.0]: Start model loading +2025-11-04T21:38:37Z INFO 8854 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:37Z INFO 8854 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:38:37Z USER 8854 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:37Z INFO 8854 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:38:37Z INFO 8854 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:38:39Z INFO 8854 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.046 seconds +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.025 seconds +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.182 seconds +2025-11-04T21:38:39Z INFO 8854 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.135 seconds +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.652 seconds +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.006 seconds +2025-11-04T21:38:40Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.542 seconds +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.577 seconds +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.133 seconds +2025-11-04T21:38:41Z INFO 8854 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.726 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.144 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.076 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.083 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.208 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.069 seconds +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:42Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.323 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.328 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.242 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.895 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.113 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.091 seconds +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:43Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.080 seconds +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.081 seconds +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.095 seconds +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.062 seconds +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.048 seconds +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:44Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 2.066 seconds +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.318 seconds +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 2.386 seconds +2025-11-04T21:38:46Z INFO 8854 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.397 seconds +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.007 seconds +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.689 seconds +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.723 seconds +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:47Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 8.836 seconds +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.301 seconds +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 9.138 seconds +2025-11-04T21:38:56Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:57Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:57Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.936 seconds +2025-11-04T21:38:57Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.315 seconds +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.228 seconds +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.386 seconds +2025-11-04T21:38:58Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.417 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.318 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.164 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.152 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.112 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.109 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.375 seconds +2025-11-04T21:38:59Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:00Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 1.006 seconds +2025-11-04T21:39:00Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:02Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 2.018 seconds +2025-11-04T21:39:02Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:02Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.189 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.192 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.124 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.256 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.217 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.118 seconds +2025-11-04T21:39:03Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.233 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.581 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.032 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.060 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.115 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.109 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.225 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.127 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.058 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.019 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.144 seconds +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:04Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.185 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.196 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.388 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.030 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.105 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.105 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.055 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.099 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.033 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.031 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.032 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.127 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.127 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.096 seconds +2025-11-04T21:39:05Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.186 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.010 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.032 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.032 seconds +2025-11-04T21:39:07Z INFO 8854 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.042 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.029 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.100 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.100 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-9247 AllGather_add(float32 (256, 8) %'transpose.537', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9754 | hlo_id: 9754 | , id = 9247 +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-9263 AllGather_add(uint32 (256, 8) %'transpose.538', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9893 | hlo_id: 9893 | , id = 9263 +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.282 seconds +2025-11-04T21:39:07Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.547 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.135 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.336 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.037 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.341 seconds +2025-11-04T21:39:08Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.030 seconds +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.196 seconds +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.061 seconds +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.688 seconds +2025-11-04T21:39:09Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.387 seconds +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.245 seconds +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:39:10Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.136 seconds +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.933 seconds +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.449 seconds +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.391 seconds +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:39:11Z INFO 8854 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:13Z INFO 8854 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:17Z INFO 8854 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:39:17Z INFO 8854 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 5.508 seconds +2025-11-04T21:39:17Z INFO 8854 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:39:17Z INFO 8854 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:39:17Z INFO 8854 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 14.657 seconds +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.743 seconds +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 15.410 seconds +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:39:32Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.166 seconds +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.210 seconds +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.725 seconds +2025-11-04T21:39:33Z INFO 8854 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:39:40Z INFO 8854 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:39:40Z INFO 8854 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1159 +total number of sharded dags: 408 + +total bytes transferred from input, output, non local tensors: 2197268808 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 2194636932 +% bytes transferred with 2x bandwidths: 99.88 + +NC0 FLOPs: 7370708121 +NC1 FLOPs: 7363368832 +% FLOPs sharded: 99.95 + + +Shard dim: 1024, Number of dags: 198 +Matmuls sharded with this dim: +[2,1024(s)] @ [1024(s),128] = [2,128] Number of occurrences: 28 +[2,2,64] @ [2,64,1024(s)] = [2,1024(s)] Number of occurrences: 28 + + +Shard dim: 2, Number of dags: 196 +Matmuls sharded with this dim: +[8,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [8,8,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,128] = [8,2,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [8,2,2,2,2,64] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [8,2,2,2,64] Number of occurrences: 28 +[8,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [8,2(s),2,4,128] Number of occurrences: 28 +[8,2,8,128] @ [2,8,128,2(s),6,2,128] = [8,2(s),6,2,128] Number of occurrences: 56 + + +Shard dim: 256, Number of dags: 10 +Matmuls sharded with this dim: + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2,8,128] @ [2,8,128,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:39:41Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:41Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:41Z INFO 8854 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.433 seconds +2025-11-04T21:39:41Z INFO 8854 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 1.262 seconds +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 9.254 seconds +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.117 seconds +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.179 seconds +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.221 seconds +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:39:43Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (79, 'AG3736'), (80, 'AG3735'), (218, 'AG3727'), (474, 'AG3726'), (274, 'AG3733')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (83, 'AG3752'), (84, 'AG3751'), (218, 'AG3727'), (474, 'AG3726'), (272, 'AG3749')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (87, 'AG3768'), (88, 'AG3767'), (218, 'AG3727'), (474, 'AG3726'), (270, 'AG3765')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10312 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (91, 'AG3784'), (92, 'AG3783'), (218, 'AG3727'), (474, 'AG3726'), (268, 'AG3781')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (95, 'AG3800'), (96, 'AG3799'), (218, 'AG3727'), (474, 'AG3726'), (266, 'AG3797')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (99, 'AG3816'), (100, 'AG3815'), (218, 'AG3727'), (474, 'AG3726'), (264, 'AG3813')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11065 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (103, 'AG3832'), (104, 'AG3831'), (218, 'AG3727'), (474, 'AG3726'), (262, 'AG3829')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11316 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (107, 'AG3848'), (108, 'AG3847'), (218, 'AG3727'), (474, 'AG3726'), (260, 'AG3845')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (111, 'AG3864'), (112, 'AG3863'), (218, 'AG3727'), (474, 'AG3726'), (258, 'AG3861')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11818 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (115, 'AG3880'), (116, 'AG3879'), (218, 'AG3727'), (474, 'AG3726'), (256, 'AG3877')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (119, 'AG3896'), (120, 'AG3895'), (218, 'AG3727'), (474, 'AG3726'), (254, 'AG3893')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12320 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (123, 'AG3912'), (124, 'AG3911'), (218, 'AG3727'), (474, 'AG3726'), (252, 'AG3909')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12571 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (127, 'AG3928'), (128, 'AG3927'), (218, 'AG3727'), (474, 'AG3726'), (250, 'AG3925')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (131, 'AG3944'), (132, 'AG3943'), (218, 'AG3727'), (474, 'AG3726'), (248, 'AG3941')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13073 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (135, 'AG3960'), (136, 'AG3959'), (218, 'AG3727'), (474, 'AG3726'), (246, 'AG3957')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13324 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (139, 'AG3976'), (140, 'AG3975'), (218, 'AG3727'), (474, 'AG3726'), (244, 'AG3973')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (143, 'AG3992'), (144, 'AG3991'), (218, 'AG3727'), (474, 'AG3726'), (242, 'AG3989')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13826 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (147, 'AG4008'), (148, 'AG4007'), (218, 'AG3727'), (474, 'AG3726'), (240, 'AG4005')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (151, 'AG4024'), (152, 'AG4023'), (218, 'AG3727'), (474, 'AG3726'), (238, 'AG4021')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14328 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (155, 'AG4040'), (156, 'AG4039'), (218, 'AG3727'), (474, 'AG3726'), (236, 'AG4037')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (159, 'AG4056'), (160, 'AG4055'), (218, 'AG3727'), (474, 'AG3726'), (234, 'AG4053')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (163, 'AG4072'), (164, 'AG4071'), (218, 'AG3727'), (474, 'AG3726'), (232, 'AG4069')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (167, 'AG4088'), (168, 'AG4087'), (218, 'AG3727'), (474, 'AG3726'), (230, 'AG4085')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15332 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (171, 'AG4104'), (172, 'AG4103'), (218, 'AG3727'), (474, 'AG3726'), (228, 'AG4101')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (175, 'AG4120'), (176, 'AG4119'), (218, 'AG3727'), (474, 'AG3726'), (226, 'AG4117')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (179, 'AG4136'), (180, 'AG4135'), (218, 'AG3727'), (474, 'AG3726'), (224, 'AG4133')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16085 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (183, 'AG4152'), (184, 'AG4151'), (218, 'AG3727'), (474, 'AG3726'), (222, 'AG4149')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 2, 2, 4, 2, 512, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (187, 'AG4168'), (188, 'AG4167'), (218, 'AG3727'), (474, 'AG3726'), (220, 'AG4165')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(78, 'AG3741'), (273, 'AG3740'), (79, 'AG3736'), (80, 'AG3735'), (81, 'AG3734'), (358, 'AG3739'), (470, 'AG3738')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (471, 'AG3737')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23387 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(79, 'AG3736'), (191, 'AG3731'), (80, 'AG3735'), (81, 'AG3734'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(469, 'AG3742'), (74, 'AG3744'), (357, 'AG3743')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(37, 'AG3748'), (1, 'AG3745'), (356, 'AG3747'), (468, 'AG3746')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23399 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(82, 'AG3757'), (271, 'AG3756'), (83, 'AG3752'), (84, 'AG3751'), (85, 'AG3750'), (355, 'AG3755'), (466, 'AG3754')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (467, 'AG3753')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23402 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(83, 'AG3752'), (191, 'AG3731'), (84, 'AG3751'), (85, 'AG3750'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(465, 'AG3758'), (75, 'AG3760'), (354, 'AG3759')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(38, 'AG3764'), (2, 'AG3761'), (353, 'AG3763'), (464, 'AG3762')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(86, 'AG3773'), (269, 'AG3772'), (87, 'AG3768'), (88, 'AG3767'), (89, 'AG3766'), (352, 'AG3771'), (462, 'AG3770')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (463, 'AG3769')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23417 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(87, 'AG3768'), (191, 'AG3731'), (88, 'AG3767'), (89, 'AG3766'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(461, 'AG3774'), (76, 'AG3776'), (351, 'AG3775')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(39, 'AG3780'), (3, 'AG3777'), (350, 'AG3779'), (460, 'AG3778')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input91|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23429 of IO tensor {'CrossPassTensor': ''}bfloat16 %input93|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(90, 'AG3789'), (267, 'AG3788'), (91, 'AG3784'), (92, 'AG3783'), (93, 'AG3782'), (349, 'AG3787'), (458, 'AG3786')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (459, 'AG3785')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23432 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(91, 'AG3784'), (191, 'AG3731'), (92, 'AG3783'), (93, 'AG3782'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(457, 'AG3790'), (192, 'AG3792'), (348, 'AG3791')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(40, 'AG3796'), (4, 'AG3793'), (347, 'AG3795'), (456, 'AG3794')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(94, 'AG3805'), (265, 'AG3804'), (95, 'AG3800'), (96, 'AG3799'), (97, 'AG3798'), (346, 'AG3803'), (454, 'AG3802')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (455, 'AG3801')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(95, 'AG3800'), (191, 'AG3731'), (96, 'AG3799'), (97, 'AG3798'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(453, 'AG3806'), (193, 'AG3808'), (345, 'AG3807')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(41, 'AG3812'), (5, 'AG3809'), (344, 'AG3811'), (452, 'AG3810')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(98, 'AG3821'), (263, 'AG3820'), (99, 'AG3816'), (100, 'AG3815'), (101, 'AG3814'), (343, 'AG3819'), (450, 'AG3818')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (451, 'AG3817')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(99, 'AG3816'), (191, 'AG3731'), (100, 'AG3815'), (101, 'AG3814'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(449, 'AG3822'), (194, 'AG3824'), (342, 'AG3823')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG3828'), (6, 'AG3825'), (341, 'AG3827'), (448, 'AG3826')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(102, 'AG3837'), (261, 'AG3836'), (103, 'AG3832'), (104, 'AG3831'), (105, 'AG3830'), (340, 'AG3835'), (446, 'AG3834')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (447, 'AG3833')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(103, 'AG3832'), (191, 'AG3731'), (104, 'AG3831'), (105, 'AG3830'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(445, 'AG3838'), (195, 'AG3840'), (339, 'AG3839')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(43, 'AG3844'), (7, 'AG3841'), (338, 'AG3843'), (444, 'AG3842')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(106, 'AG3853'), (259, 'AG3852'), (107, 'AG3848'), (108, 'AG3847'), (109, 'AG3846'), (337, 'AG3851'), (442, 'AG3850')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (443, 'AG3849')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23492 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(107, 'AG3848'), (191, 'AG3731'), (108, 'AG3847'), (109, 'AG3846'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(441, 'AG3854'), (196, 'AG3856'), (336, 'AG3855')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(44, 'AG3860'), (8, 'AG3857'), (335, 'AG3859'), (440, 'AG3858')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(110, 'AG3869'), (257, 'AG3868'), (111, 'AG3864'), (112, 'AG3863'), (113, 'AG3862'), (334, 'AG3867'), (438, 'AG3866')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (439, 'AG3865')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23507 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(111, 'AG3864'), (191, 'AG3731'), (112, 'AG3863'), (113, 'AG3862'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(437, 'AG3870'), (197, 'AG3872'), (333, 'AG3871')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG3876'), (9, 'AG3873'), (332, 'AG3875'), (436, 'AG3874')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(114, 'AG3885'), (255, 'AG3884'), (115, 'AG3880'), (116, 'AG3879'), (117, 'AG3878'), (331, 'AG3883'), (434, 'AG3882')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (435, 'AG3881')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23522 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(115, 'AG3880'), (191, 'AG3731'), (116, 'AG3879'), (117, 'AG3878'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(433, 'AG3886'), (198, 'AG3888'), (330, 'AG3887')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(46, 'AG3892'), (10, 'AG3889'), (329, 'AG3891'), (432, 'AG3890')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(118, 'AG3901'), (253, 'AG3900'), (119, 'AG3896'), (120, 'AG3895'), (121, 'AG3894'), (328, 'AG3899'), (430, 'AG3898')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (431, 'AG3897')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23537 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(119, 'AG3896'), (191, 'AG3731'), (120, 'AG3895'), (121, 'AG3894'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(429, 'AG3902'), (199, 'AG3904'), (327, 'AG3903')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(47, 'AG3908'), (11, 'AG3905'), (326, 'AG3907'), (428, 'AG3906')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(122, 'AG3917'), (251, 'AG3916'), (123, 'AG3912'), (124, 'AG3911'), (125, 'AG3910'), (325, 'AG3915'), (426, 'AG3914')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (427, 'AG3913')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23552 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(123, 'AG3912'), (191, 'AG3731'), (124, 'AG3911'), (125, 'AG3910'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(425, 'AG3918'), (200, 'AG3920'), (324, 'AG3919')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(48, 'AG3924'), (12, 'AG3921'), (323, 'AG3923'), (424, 'AG3922')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(126, 'AG3933'), (249, 'AG3932'), (127, 'AG3928'), (128, 'AG3927'), (129, 'AG3926'), (322, 'AG3931'), (422, 'AG3930')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (423, 'AG3929')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(127, 'AG3928'), (191, 'AG3731'), (128, 'AG3927'), (129, 'AG3926'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(421, 'AG3934'), (201, 'AG3936'), (321, 'AG3935')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(49, 'AG3940'), (13, 'AG3937'), (320, 'AG3939'), (420, 'AG3938')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(130, 'AG3949'), (247, 'AG3948'), (131, 'AG3944'), (132, 'AG3943'), (133, 'AG3942'), (319, 'AG3947'), (418, 'AG3946')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (419, 'AG3945')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG3944'), (191, 'AG3731'), (132, 'AG3943'), (133, 'AG3942'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(417, 'AG3950'), (202, 'AG3952'), (318, 'AG3951')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(50, 'AG3956'), (14, 'AG3953'), (317, 'AG3955'), (416, 'AG3954')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(134, 'AG3965'), (245, 'AG3964'), (135, 'AG3960'), (136, 'AG3959'), (137, 'AG3958'), (316, 'AG3963'), (414, 'AG3962')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (415, 'AG3961')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23597 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(135, 'AG3960'), (191, 'AG3731'), (136, 'AG3959'), (137, 'AG3958'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(413, 'AG3966'), (203, 'AG3968'), (315, 'AG3967')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(51, 'AG3972'), (15, 'AG3969'), (314, 'AG3971'), (412, 'AG3970')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(138, 'AG3981'), (243, 'AG3980'), (139, 'AG3976'), (140, 'AG3975'), (141, 'AG3974'), (313, 'AG3979'), (410, 'AG3978')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (411, 'AG3977')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23612 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(139, 'AG3976'), (191, 'AG3731'), (140, 'AG3975'), (141, 'AG3974'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(409, 'AG3982'), (204, 'AG3984'), (312, 'AG3983')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(52, 'AG3988'), (16, 'AG3985'), (311, 'AG3987'), (408, 'AG3986')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(142, 'AG3997'), (241, 'AG3996'), (143, 'AG3992'), (144, 'AG3991'), (145, 'AG3990'), (310, 'AG3995'), (406, 'AG3994')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (407, 'AG3993')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23627 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(143, 'AG3992'), (191, 'AG3731'), (144, 'AG3991'), (145, 'AG3990'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(405, 'AG3998'), (205, 'AG4000'), (309, 'AG3999')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(53, 'AG4004'), (17, 'AG4001'), (308, 'AG4003'), (404, 'AG4002')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(146, 'AG4013'), (239, 'AG4012'), (147, 'AG4008'), (148, 'AG4007'), (149, 'AG4006'), (307, 'AG4011'), (402, 'AG4010')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (403, 'AG4009')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23642 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(147, 'AG4008'), (191, 'AG3731'), (148, 'AG4007'), (149, 'AG4006'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(401, 'AG4014'), (206, 'AG4016'), (306, 'AG4015')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(54, 'AG4020'), (18, 'AG4017'), (305, 'AG4019'), (400, 'AG4018')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(150, 'AG4029'), (237, 'AG4028'), (151, 'AG4024'), (152, 'AG4023'), (153, 'AG4022'), (304, 'AG4027'), (398, 'AG4026')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (399, 'AG4025')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(151, 'AG4024'), (191, 'AG3731'), (152, 'AG4023'), (153, 'AG4022'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(397, 'AG4030'), (207, 'AG4032'), (303, 'AG4031')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(55, 'AG4036'), (19, 'AG4033'), (302, 'AG4035'), (396, 'AG4034')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23682 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(154, 'AG4045'), (235, 'AG4044'), (155, 'AG4040'), (156, 'AG4039'), (157, 'AG4038'), (301, 'AG4043'), (394, 'AG4042')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (395, 'AG4041')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(155, 'AG4040'), (191, 'AG3731'), (156, 'AG4039'), (157, 'AG4038'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(393, 'AG4046'), (208, 'AG4048'), (300, 'AG4047')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(56, 'AG4052'), (20, 'AG4049'), (299, 'AG4051'), (392, 'AG4050')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23683 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(158, 'AG4061'), (233, 'AG4060'), (159, 'AG4056'), (160, 'AG4055'), (161, 'AG4054'), (298, 'AG4059'), (390, 'AG4058')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (391, 'AG4057')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(159, 'AG4056'), (191, 'AG3731'), (160, 'AG4055'), (161, 'AG4054'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(389, 'AG4062'), (209, 'AG4064'), (297, 'AG4063')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(57, 'AG4068'), (21, 'AG4065'), (296, 'AG4067'), (388, 'AG4066')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(162, 'AG4077'), (231, 'AG4076'), (163, 'AG4072'), (164, 'AG4071'), (165, 'AG4070'), (295, 'AG4075'), (386, 'AG4074')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (387, 'AG4073')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(163, 'AG4072'), (191, 'AG3731'), (164, 'AG4071'), (165, 'AG4070'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(385, 'AG4078'), (210, 'AG4080'), (294, 'AG4079')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(58, 'AG4084'), (22, 'AG4081'), (293, 'AG4083'), (384, 'AG4082')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(166, 'AG4093'), (229, 'AG4092'), (167, 'AG4088'), (168, 'AG4087'), (169, 'AG4086'), (292, 'AG4091'), (382, 'AG4090')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (383, 'AG4089')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(167, 'AG4088'), (191, 'AG3731'), (168, 'AG4087'), (169, 'AG4086'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(381, 'AG4094'), (211, 'AG4096'), (291, 'AG4095')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(59, 'AG4100'), (23, 'AG4097'), (290, 'AG4099'), (380, 'AG4098')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(170, 'AG4109'), (227, 'AG4108'), (171, 'AG4104'), (172, 'AG4103'), (173, 'AG4102'), (289, 'AG4107'), (378, 'AG4106')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (379, 'AG4105')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(171, 'AG4104'), (191, 'AG3731'), (172, 'AG4103'), (173, 'AG4102'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(377, 'AG4110'), (212, 'AG4112'), (288, 'AG4111')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(60, 'AG4116'), (24, 'AG4113'), (287, 'AG4115'), (376, 'AG4114')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(174, 'AG4125'), (225, 'AG4124'), (175, 'AG4120'), (176, 'AG4119'), (177, 'AG4118'), (286, 'AG4123'), (374, 'AG4122')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (375, 'AG4121')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23747 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(175, 'AG4120'), (191, 'AG3731'), (176, 'AG4119'), (177, 'AG4118'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(373, 'AG4126'), (213, 'AG4128'), (285, 'AG4127')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(61, 'AG4132'), (25, 'AG4129'), (284, 'AG4131'), (372, 'AG4130')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(178, 'AG4141'), (223, 'AG4140'), (179, 'AG4136'), (180, 'AG4135'), (181, 'AG4134'), (283, 'AG4139'), (370, 'AG4138')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (371, 'AG4137')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(179, 'AG4136'), (191, 'AG3731'), (180, 'AG4135'), (181, 'AG4134'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(369, 'AG4142'), (214, 'AG4144'), (282, 'AG4143')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(62, 'AG4148'), (26, 'AG4145'), (281, 'AG4147'), (368, 'AG4146')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23773 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23787 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(182, 'AG4157'), (221, 'AG4156'), (183, 'AG4152'), (184, 'AG4151'), (185, 'AG4150'), (280, 'AG4155'), (366, 'AG4154')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (367, 'AG4153')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23777 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(183, 'AG4152'), (191, 'AG3731'), (184, 'AG4151'), (185, 'AG4150'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(365, 'AG4158'), (215, 'AG4160'), (279, 'AG4159')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(63, 'AG4164'), (27, 'AG4161'), (278, 'AG4163'), (364, 'AG4162')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23802 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(186, 'AG4173'), (219, 'AG4172'), (187, 'AG4168'), (188, 'AG4167'), (189, 'AG4166'), (277, 'AG4171'), (362, 'AG4170')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (363, 'AG4169')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23792 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(187, 'AG4168'), (191, 'AG3731'), (188, 'AG4167'), (189, 'AG4166'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(361, 'AG4174'), (216, 'AG4176'), (276, 'AG4175')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(64, 'AG4180'), (28, 'AG4177'), (275, 'AG4179'), (360, 'AG4178')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23803 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(190, 'AG4182'), (217, 'AG4181'), (191, 'AG3731')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23385 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23386 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16621 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23361 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16596 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16639 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16656 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.613 seconds +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.252 seconds +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.310 seconds +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.068 seconds +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.030 seconds +2025-11-04T21:39:46Z INFO 8854 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:39:49Z INFO 8854 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:39:49Z INFO 8854 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.669 seconds +2025-11-04T21:39:49Z INFO 8854 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 5.983 seconds +2025-11-04T21:39:49Z INFO 8854 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.888 seconds +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.132 seconds +2025-11-04T21:39:50Z INFO 8854 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.341 seconds +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 40.682 seconds +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.478 seconds +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.319 seconds +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:39:51Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.632 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.070 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.703 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.314 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.314 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.087 seconds +2025-11-04T21:39:53Z INFO 8854 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.047 seconds +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.183 seconds +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.124 seconds +2025-11-04T21:39:54Z INFO 8854 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.237 seconds +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 594: transpose_128x8 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 512: transpose_128x64 +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.121 seconds +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:57Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.358 seconds +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.359 seconds +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.700 seconds +2025-11-04T21:39:58Z INFO 8854 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.414 seconds +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.518 seconds +2025-11-04T21:39:59Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.521 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 1.040 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.073 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.225 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.091 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.033 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.103 seconds +2025-11-04T21:40:00Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:40:01Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8854 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.285 seconds +2025-11-04T21:40:01Z INFO 8854 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:40:02Z INFO 8854 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8854 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.391 seconds +2025-11-04T21:40:02Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:02Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.507 seconds +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.507 seconds +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.076 seconds +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.290 seconds +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/LICM]: LICM finished after 0.122 seconds +2025-11-04T21:40:03Z INFO 8854 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.416 seconds +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.020 seconds +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.036 seconds +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:40:04Z INFO 8854 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.080 seconds +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.074 seconds +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.752 seconds +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:40:05Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.560 seconds +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.274 seconds +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.251 seconds +2025-11-04T21:40:06Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.252 seconds +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.251 seconds +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.591 seconds +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.063 seconds +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.355 seconds +2025-11-04T21:40:07Z INFO 8854 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:08Z INFO 8854 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8854 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.519 seconds +2025-11-04T21:40:08Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:08Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.053 seconds +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.202 seconds +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.180 seconds +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.447 seconds +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.125 seconds +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:09Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.200 seconds +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.188 seconds +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.396 seconds +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.092 seconds +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:40:10Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.618 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.050 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.668 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.260 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.088 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.060 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.061 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.089 seconds +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:40:11Z INFO 8854 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:40:12Z INFO 8854 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.812 seconds +2025-11-04T21:40:12Z INFO 8854 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8854 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.813 seconds +2025-11-04T21:40:12Z INFO 8854 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:40:13Z INFO 8854 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:40:13Z INFO 8854 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.464 seconds +2025-11-04T21:40:13Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:13Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.559 seconds +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.559 seconds +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.080 seconds +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:40:14Z INFO 8854 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:40:15Z INFO 8854 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 1.228 seconds +2025-11-04T21:40:15Z INFO 8854 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:40:15Z INFO 8854 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 1.228 seconds +2025-11-04T21:40:15Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.187 seconds +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.489 seconds +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.110 seconds +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:16Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.956 seconds +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.180 seconds +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.144 seconds +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.049 seconds +2025-11-04T21:40:17Z INFO 8854 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:18Z INFO 8854 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:18Z INFO 8854 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.665 seconds +2025-11-04T21:40:18Z INFO 8854 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.298 seconds +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.056 seconds +2025-11-04T21:40:19Z INFO 8854 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.146 seconds +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.272 seconds +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:20Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.725 seconds +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.637 seconds +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.363 seconds +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.061 seconds +2025-11-04T21:40:21Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.968 seconds +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.117 seconds +2025-11-04T21:40:22Z INFO 8854 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.181 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.277 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.088 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.038 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.351 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.041 seconds +2025-11-04T21:40:23Z INFO 8854 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:24Z INFO 8854 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:40:24Z INFO 8854 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.165 seconds +2025-11-04T21:40:24Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:28Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 2.161 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.246 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 6.513 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.150 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.193 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.068 seconds +2025-11-04T21:40:30Z INFO 8854 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.130 seconds +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 12.382% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'38147.56431'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=56430, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_38147 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 1.877% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'38660.56441'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=56439, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_38660 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 138.776us (32.031MiB, est bw: 242.024GB/s, 1.128% of tot. time) for bfloat16<128 x 8200> TongaSB partitions[2] bfloat16 (2, 8, 128, 8200) %'all_gather.1_nostride_60851'(init=0.0)[i242_0_0_42945,T_i2,i0.128,i1.8200] = load bfloat16<128 x 8200> non_local bfloat16 (16384,) %'all_gather.1'[8i0.128+1024T_i2+i1.8200] # id=48224, src_id=None, , attrs={'can_read_uninit': True}, instances=16 # dl = tensor_op_name: _add.383 | hlo_id: 383 | [[i0.128];[i1.8200]] -> [[i0.128];[i1.8200]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input5_local_39953'[i160_25349_0_42922,2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input5'[2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,0,i160_25349_0_42922,i2.4,i0.128,i1.128] # id=48193, src_id=None, , instances=64 # dl = tensor_op_name: _dot.354 | hlo_id: 13489 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input7_local_40064'[i359_25350_0_42996,2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input7'[2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,0,i359_25350_0_42996,i2.4,i0.128,i1.128] # id=48392, src_id=None, , instances=64 # dl = tensor_op_name: _dot.698 | hlo_id: 13600 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input9_local_40167'[i531_25351_0_43070,2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input9'[2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,0,i531_25351_0_43070,i2.4,i0.128,i1.128] # id=48587, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1042 | hlo_id: 13711 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input11_local_40270'[i703_25352_0_43144,2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input11'[2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,0,i703_25352_0_43144,i2.4,i0.128,i1.128] # id=48782, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1386 | hlo_id: 13822 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input13_local_40373'[i875_25353_0_43218,2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input13'[2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,0,i875_25353_0_43218,i2.4,i0.128,i1.128] # id=48977, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1730 | hlo_id: 13933 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input15_local_40476'[i1047_25354_0_43292,2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input15'[2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,0,i1047_25354_0_43292,i2.4,i0.128,i1.128] # id=49172, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2074 | hlo_id: 14044 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 67.046us (8.000MiB, est bw: 125.117GB/s, 0.545% of tot. time) for bfloat16<128 x 512> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 4, 128) %'input17_local_40579'[i1219_25355_0_43366,2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,i0.128,i2.4,i1.128] = load bfloat16<128 x 512> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 4, 2, 4, 128, 128) %'input17'[2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,0,i1219_25355_0_43366,i2.4,i0.128,i1.128] # id=49367, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2418 | hlo_id: 14155 | [[i0.128];[i1.128, i2.4]] -> [[i0.128];[i1.128, i2.4]] +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.081 seconds +2025-11-04T21:40:31Z INFO 8854 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.018 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.028 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.014 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.008 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.020 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8854 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8854 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.667 seconds +2025-11-04T21:40:32Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:32Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.861 seconds +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.861 seconds +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:40:33Z WARNING 8854 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 82.34 percent of all matmul computation +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.132 seconds +2025-11-04T21:40:33Z INFO 8854 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.502 seconds +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.102 seconds +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.114 seconds +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.228 seconds +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:34Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.693 seconds +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.693 seconds +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.140 seconds +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.164 seconds +2025-11-04T21:40:35Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:37Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:37Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.894 seconds +2025-11-04T21:40:39Z INFO 8854 [Tensorizer]: BirCodeGen estimate #instances=88271 in sg0000 +2025-11-04T21:40:39Z INFO 8854 [Tensorizer]: IR signature: 7c28586a5a137f628c24fbc824dc528dcc2616146b60debb856356c2b5565785 for nc00/sg0000/TensorizerBIR +2025-11-04T21:40:39Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:41Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:41Z INFO 8854 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.861 seconds +2025-11-04T21:40:43Z INFO 8854 [Tensorizer]: BirCodeGen estimate #instances=88271 in sg0000 +2025-11-04T21:40:43Z INFO 8854 [Tensorizer]: IR signature: a59eb8338fed1b4401f7f787a24598c21991d6aabbb85a45c784f9e631fdf5ca for nc01/sg0000/TensorizerBIR +2025-11-04T21:40:43Z INFO 8854 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:40:43Z INFO 8854 [Tensorizer]: Successfully built model. +2025-11-04T21:40:43Z USER 8854 [root/Tensorizer/Tensorizer]: Tensorizer finished after 125.589 seconds +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: End tensorization +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input0 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input1 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input2 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input3 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input4 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input5 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input6 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input7 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input8 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input9 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input10 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input11 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input12 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input13 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input14 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input15 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input16 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input17 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input18 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input19 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input20 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input21 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input22 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input23 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input24 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input25 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input26 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input27 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input28 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input29 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input30 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input31 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input32 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input33 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input34 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input35 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input36 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input37 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input38 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input39 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input40 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input41 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input42 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input43 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input44 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input45 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input46 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input47 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input48 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input49 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input50 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input51 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input52 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input53 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input54 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input55 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input56 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input57 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input58 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input59 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input60 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input61 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input62 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input63 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input64 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input65 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input66 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input67 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input68 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input69 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input70 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input71 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input72 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input73 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input74 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input75 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input76 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input77 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input78 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input79 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input80 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input81 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input82 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input83 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input84 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input85 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input86 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input87 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input88 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input89 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input90 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input91 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input92 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input93 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input94 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input95 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input96 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input97 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input98 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input99 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input100 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input101 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input102 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input103 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input104 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input105 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input106 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input107 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input108 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input109 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input110 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input111 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input112 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input113 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input114 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input115 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input116 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input117 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input118 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input119 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input120 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input121 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input122 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input123 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input124 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input125 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input126 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input127 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input128 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input129 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input130 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input131 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input132 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input133 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input134 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input135 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input136 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input137 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input138 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input139 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input140 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input141 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input142 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input143 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input144 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input145 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input146 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input147 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input148 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input149 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input150 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input151 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input152 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input153 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input154 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input155 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input156 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input157 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input158 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input159 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input160 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input161 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input162 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input163 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input164 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input165 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input166 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input167 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input168 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input169 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input170 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input171 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input172 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input173 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input174 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input175 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input176 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input177 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input178 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input179 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input180 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input181 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input182 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input183 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input184 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input185 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input186 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input187 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input188 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input189 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input190 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input191 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input192 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input193 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input194 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input195 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input196 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input197 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input198 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input199 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input200 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input201 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input202 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input203 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input204 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input205 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input206 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input207 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input208 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input209 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input210 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input211 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input212 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input213 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input214 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input215 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input216 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input217 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input218 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input219 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input220 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input221 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input222 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input223 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input224 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input225 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input226 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input227 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input228 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input229 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input230 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input231 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input232 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input233 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input234 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input235 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input236 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input237 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input238 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input239 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input240 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input241 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input242 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input243 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input244 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input245 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input246 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input247 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input248 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input249 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input250 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input251 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input252 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input253 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input254 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input255 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input256 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input257 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input258 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input259 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input260 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input261 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input262 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input263 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input264 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input265 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input266 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input267 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input268 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input269 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input270 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input271 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input272 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input273 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input274 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input275 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input276 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input277 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input278 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input279 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input280 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input281 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input282 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input283 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input284 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input285 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input286 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input287 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input288 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input289 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input290 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input291 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input292 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input293 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input294 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input295 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input296 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input297 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input298 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input299 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input300 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input301 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input302 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input303 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input304 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input305 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input306 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input307 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input308 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input309 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input310 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input311 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input312 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input313 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input314 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input315 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input316 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input317 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input318 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input319 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input320 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input321 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input322 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input323 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input324 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input325 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input326 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input327 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input328 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input329 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input330 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input331 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input332 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input333 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input334 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input335 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input336 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input337 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input338 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input339 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input340 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input341 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input342 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input343 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input344 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input345 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input346 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input347 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input348 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input349 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input350 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input351 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input352 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input353 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input354 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input355 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input356 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input357 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input358 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input359 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input360 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input361 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input362 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input363 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input364 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input365 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input366 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input367 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input368 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input369 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Network input: input370 +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:43Z INFO 8854 [job.Frontend.0]: Job #0 finished +2025-11-04T21:40:43Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:40:43Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:40:43Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:40:43Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels spill_reload,io,scalar_dynamic_offset,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:40:43Z INFO 8854 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:40:43Z INFO 9545 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:40:43Z INFO 9545 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:40:43Z INFO 9545 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:40:43Z INFO 9545 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:40:44Z INFO 9545 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi" +2025-11-04T21:40:44Z INFO 9545 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:40:44Z INFO 9545 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:40:44Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:44Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 416mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 416mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z WARNING 9545 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.363.63805}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:44Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.363.63805}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.183 seconds +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.183 seconds +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:44Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.194 seconds +2025-11-04T21:40:44Z INFO 9545 [BackendPassManager]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:44Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:44Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.005 seconds +2025-11-04T21:40:44Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12120 memory location(s), 2 block(s), and 10056 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:44Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.010 seconds +2025-11-04T21:40:44Z INFO 9545 [BackendPassManager]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:44Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12120 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.002 seconds +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.002 seconds +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 674mb, ru_maxrss: 920mb (delta=0mb) +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6060 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:44Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6060 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:44Z INFO 9545 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:44Z INFO 9545 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:44 2025 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:44 2025 + +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Total count: 75236 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Matmult: 58369 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: GenericCopy: 9265 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Load: 2252 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: TensorScalarPtr: 1774 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: TensorTensor: 1274 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Memset: 247 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Select: 114 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Iota: 58 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: TensorReduce: 35 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:40:46Z USER 9545 (nc01/sg00) [ModuleForkPass]: unroll finished after 1.845 seconds +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1660mb, ru_maxrss: 1660mb (delta=740mb) +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 43064 memory location(s), 1 block(s), and 75236 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=43064 blocks=1 instructions=75236 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:44 2025 + +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Total count: 76396 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Matmult: 58369 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: GenericCopy: 9265 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: TensorScalarPtr: 2334 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Load: 2252 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: TensorTensor: 1274 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Iota: 394 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Save: 378 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Memset: 247 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Select: 114 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: TensorReduce: 35 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:40:46Z USER 9545 (nc00/sg00) [ModuleForkPass]: unroll finished after 1.948 seconds +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=740mb) +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 43064 memory location(s), 1 block(s), and 76396 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=43064 blocks=1 instructions=76396 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z USER 9545 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.239 seconds +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:46Z USER 9545 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.238 seconds +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 2.205 seconds +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=740mb) +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:46Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.008 seconds +2025-11-04T21:40:46Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37532 memory location(s), 2 block(s), and 150032 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.016 seconds +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: curr_vmrss: 1387mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:46Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:46Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:46Z USER 9545 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.246 seconds +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z USER 9545 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.250 seconds +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:46Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.261 seconds +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:46Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:46Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:46Z USER 9545 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:47Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.025 seconds +2025-11-04T21:40:47Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37532 memory location(s), 2 block(s), and 150032 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:47Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.032 seconds +2025-11-04T21:40:47Z INFO 9545 [BackendPassManager]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:47Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37532 blocks=2 instructions=150032 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.017 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.019 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.009 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.014 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.057 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.009 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z WARNING 9545 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.089 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.011 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z WARNING 9545 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:47Z INFO 9545 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:47Z INFO 9545 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.001 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.022 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.021 seconds +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.066 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:40:47Z INFO 9545 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:47Z INFO 9545 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.002 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.03 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.034 seconds +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.092 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.287 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.017 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1392mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.042 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.380 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.113 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.020 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.011 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.009 seconds +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18038 memory location(s), 1 block(s), and 73647 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=18038 blocks=1 instructions=73647 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: To Spill 5 multi-layer tensors +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.056 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.177 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.013 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.011 seconds +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1393mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19494 memory location(s), 1 block(s), and 76385 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=19494 blocks=1 instructions=76385 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9545 (nc01/sg00) [build_flow_deps]: Allocs: 18050 instructions: 73659 +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: To Spill 6 multi-layer tensors +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:47Z INFO 9545 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:47Z INFO 9545 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 188364 edges +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [build_flow_deps]: Done build fdeps 188364 Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.602 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1401mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18050 memory location(s), 1 block(s), and 73659 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=18050 blocks=1 instructions=73659 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [build_flow_deps]: Allocs: 19508 instructions: 76395 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.197 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1405mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18049 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=18049 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.003 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1408mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18050 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=18050 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.003 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.009 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1412mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.009 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1412mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 206101 edges +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [build_flow_deps]: Done build fdeps 206101 Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: size = 9634 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: found 22581 edges +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: mean: 4.68777 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: median: 6.44977 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 180648 bytes +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: lo = 9560 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: total = 9634 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.253 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:48 2025 +2025-11-04T21:40:48Z USER 9545 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 0.846 seconds +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19508 memory location(s), 1 block(s), and 76395 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:48Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=19508 blocks=1 instructions=76395 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 31 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.125 seconds +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:48Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:48Z INFO 9545 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.244 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19477 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=19477 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19478 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=19478 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.010 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.010 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: size = 9806 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 210 PSUM Banks +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: found 22639 edges +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: mean: 4.61738 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: median: 6.41685 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 181112 bytes +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: lo = 9732 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: total = 9806 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 592 PSUM Banks +2025-11-04T21:40:49Z USER 9545 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.535 seconds +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1418mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:49Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.420 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1418mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1117420110 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4062 bytes +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1775616 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 421 bytes +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: size = 7841 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: found 9071 accumulation groups +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i23 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.306 seconds +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1418mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:49Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:49Z INFO 9545 []: find first defs for local +2025-11-04T21:40:49Z INFO 9545 []: find first defs for global +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: 2230 remat count +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:49Z INFO 9545 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Num intervals 7841 Num locations 7841 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: edge: 126109 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: mean: 32.1666 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: median: 20.0244 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: safe = 7708 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: unsafe = 129 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: total = 7839 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 7841 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Total: 7839 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (7839) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Rover zone: 0.925 (7248) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.074 (583) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.001 (8) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.122 (955) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (1) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.758 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.000 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.000 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.878 (6883) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.632 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.999 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1117420110 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4062 bytes +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1775616 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 421 bytes +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:50Z USER 9545 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.675 seconds +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1423mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:50Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 317 PSUM Banks +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:50Z USER 9545 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.131 seconds +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1423mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18051 memory location(s), 1 block(s), and 73658 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:50Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=18051 blocks=1 instructions=73658 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1119195726, 97.886% input load, 0% output write, 2.11398% spill/reload [sg0000] +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 597 PSUM Banks +2025-11-04T21:40:50Z USER 9545 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.642 seconds +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1424mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:50Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1122519734 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4068 bytes +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2807914 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 558 bytes +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.09554e+09) +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload instructions +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload memory locations +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: size = 9089 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: found 9243 accumulation groups +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i10 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2048, 0.00865613% out of total spill/reload dma traffic +2025-11-04T21:40:50Z INFO 9545 []: find first defs for local +2025-11-04T21:40:50Z INFO 9545 []: find first defs for global +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: 2239 remat count +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:50Z INFO 9545 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:50Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Num intervals 9089 Num locations 9089 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: edge: 156285 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: mean: 34.3899 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: median: 21.9883 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: safe = 8926 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: unsafe = 159 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: total = 9087 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 9089 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Total: 9087 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (9087) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Rover zone: 0.901 (8188) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.065 (590) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.034 (305) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (4) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.109 (990) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.073 (660) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.296 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.334 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.920 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.818 (7437) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.566 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.570 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 4 SpillSaves and Reloads +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4066 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 449 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4066 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 449 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1117419086 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4066 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1774592 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 449 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 2048, 0.000182989% out of total dma traffic +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1119193678, 97.8862% input load, 0% output write, 2.1138% spill/reload [sg0000] +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1117419086 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4066 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1774592 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 449 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16416 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 7 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3981 bytes +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:51Z USER 9545 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.443 seconds +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1424mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73654 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:51Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=18044 blocks=1 instructions=73654 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1122519734 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4068 bytes +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2807914 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 558 bytes +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:51Z USER 9545 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 1.342 seconds +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1424mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:51Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:51Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 169 Sb address +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:51Z USER 9545 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.195 seconds +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1425mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19479 memory location(s), 1 block(s), and 76364 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:51Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:51Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=19479 blocks=1 instructions=76364 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1125327648, 97.5818% input load, 2.84362e-06% output write, 2.41822% spill/reload [sg0000] +2025-11-04T21:40:52Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1498 Sb address +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.00582373% out of total dma traffic(1.09811e+09) +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 2 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 126 Sb address +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 2048, 0.00752583% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:52Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1340 Sb address +2025-11-04T21:40:52Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.461 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1426mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73654 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=18044 blocks=1 instructions=73654 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: spill space = 70656 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 77824 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: size = 4 +2025-11-04T21:40:53Z INFO 9545 []: find first defs for local +2025-11-04T21:40:53Z INFO 9545 []: find first defs for global +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 4 SpillSaves and Reloads +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 4074 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 588 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: lo = 4 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: total = 4 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 77824 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.272 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1426mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73654 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=18044 blocks=1 instructions=73654 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 74752 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 0 SpillSaves and Reloads +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 4074 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 588 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1122453174 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4074 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2806890 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 588 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 74752 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.146 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73654 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=18044 blocks=1 instructions=73654 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 1741 out of 9225 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.026 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73654 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=18044 blocks=1 instructions=73654 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.047 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 73768, number of allocs: 18044 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.01782 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.011 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.010 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.010 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 67584, 0.00600572% out of total dma traffic +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1125260064, 97.5818% input load, 2.84379e-06% output write, 2.41819% spill/reload [sg0000] +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1122453174 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4074 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2806890 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 588 bytes +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 482400 +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 85 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 3936 bytes +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:53Z USER 9545 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 1.758 seconds +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76359 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:53Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=19471 blocks=1 instructions=76359 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.077 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.016 seconds +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1427mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:53Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,23696>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,26256>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,22920>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,19272>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,16800>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<32,16552>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:53Z WARNING 9545 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<96,17672>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:53Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 189 Sb address +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.208 seconds +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1428mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.015 seconds +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1428mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [build_flow_deps]: Allocs: 18044 instructions: 73768 +2025-11-04T21:40:54Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1387 Sb address +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 188586 edges +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [build_flow_deps]: Done build fdeps 188586 Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.298 seconds +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1445mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.052 seconds +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1445mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:54Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:55Z USER 9545 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.838 seconds +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1326mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:55Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:55Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 243 Sb address +2025-11-04T21:40:55Z USER 9545 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.177 seconds +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1326mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:55Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=18044 blocks=1 instructions=73768 Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:55Z USER 9545 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.072 seconds +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1326mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18044 memory location(s), 1 block(s), and 73768 instruction(s). Max writers: 298 Max Readers: 13901 +2025-11-04T21:40:56Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 95 Sb address +2025-11-04T21:40:56Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1541 Sb address +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 3.342 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1326mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76359 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=19471 blocks=1 instructions=76359 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: spill space = 103488 bytes +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 118784 bytes +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: size = 10 +2025-11-04T21:40:57Z INFO 9545 []: find first defs for local +2025-11-04T21:40:57Z INFO 9545 []: find first defs for global +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Num intervals 10 Num locations 10 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: lo = 10 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: total = 10 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 81920 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.300 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1326mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76359 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=19471 blocks=1 instructions=76359 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 79872 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 79872 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.158 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76359 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=19471 blocks=1 instructions=76359 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 1853 out of 9470 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.028 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76359 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=19471 blocks=1 instructions=76359 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.050 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 76473, number of allocs: 19471 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.009728 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.012 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.011 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.012 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.086 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.016 seconds +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:57Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:57Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.211 seconds +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.032 seconds +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1327mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [build_flow_deps]: Allocs: 19471 instructions: 76473 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 206293 edges +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [build_flow_deps]: Done build fdeps 206293 Tue Nov 4 21:40:58 2025 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.383 seconds +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1349mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.058 seconds +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1349mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:40:58Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.618 seconds +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.183 seconds +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=19471 blocks=1 instructions=76473 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.086 seconds +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19471 memory location(s), 1 block(s), and 76473 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:59Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 12.385 seconds +2025-11-04T21:40:59Z INFO 9545 [BackendPassManager]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:59Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37515 blocks=2 instructions=150241 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=37515 blocks=2 instructions=150241 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.007 seconds +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37515 memory location(s), 2 block(s), and 150241 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=37515 blocks=2 instructions=150241 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.078 seconds +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151155 instruction(s). Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=37913 blocks=2 instructions=151155 Max writers: 298 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.258 seconds +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151159 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:59Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.366 seconds +2025-11-04T21:40:59Z INFO 9545 [BackendPassManager]: curr_vmrss: 1411mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:59Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:40:59Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:59Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: reserved space = 269640 bytes +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:59Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: size = 132 +2025-11-04T21:40:59Z INFO 9545 []: find first defs for local +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: reserved space = 236800 bytes +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:40:59Z USER 9545 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.140 seconds +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1415mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:40:59Z INFO 9545 []: find first defs for global +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Num intervals 132 Num locations 132 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: lo = 132 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: total = 132 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 81920 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 81920 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3850240 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3850240 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6303744 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.304 seconds +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.312 seconds +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:41:00Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.006 seconds +2025-11-04T21:41:00Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151159 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.013 seconds +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:00Z USER 9545 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.078 seconds +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.080 seconds +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.087 seconds +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: curr_vmrss: 1416mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:00Z USER 9545 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:00Z INFO 9545 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z INFO 9545 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z USER 9545 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.354 seconds +2025-11-04T21:41:00Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.369 seconds +2025-11-04T21:41:00Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z USER 9545 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: nc_parallel_pass finished after 0.408 seconds +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:00Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151159 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:00Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z USER 9545 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.002 seconds +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.007 seconds +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1417mb, ru_maxrss: 1660mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:00Z INFO 9545 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:00Z INFO 9545 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:00Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:00Z INFO 9545 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:00Z INFO 9545 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:00Z INFO 9545 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:00 2025 +2025-11-04T21:41:00Z INFO 9545 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:00 2025 +2025-11-04T21:41:01Z INFO 9545 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:01Z INFO 9545 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:03Z INFO 9545 [post_scheduler]: Time-aware simulation time: 6835126 +2025-11-04T21:41:03Z INFO 9545 [post_scheduler]: Time-aware simulation time: 6425144 +2025-11-04T21:41:04Z INFO 9545 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: post_sched finished after 3.507 seconds +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1685mb, ru_maxrss: 1685mb (delta=25mb) +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.013 seconds +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1525mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z INFO 9545 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:04 2025 +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: post_sched finished after 3.627 seconds +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 1685mb (delta=25mb) +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.014 seconds +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1507mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74227 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=18243 blocks=1 instructions=74227 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.108 seconds +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1508mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.150 seconds +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1508mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:04Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 3.829 seconds +2025-11-04T21:41:04Z INFO 9545 [BackendPassManager]: curr_vmrss: 1508mb, ru_maxrss: 1685mb (delta=25mb) +2025-11-04T21:41:04Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:04Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151127 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:04Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=37913 blocks=2 instructions=151127 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.028 seconds +2025-11-04T21:41:04Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1508mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151127 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:04Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.057 seconds +2025-11-04T21:41:04Z INFO 9545 [BackendPassManager]: curr_vmrss: 1508mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:04Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:04Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151127 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:04Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:04Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:04Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:04Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:05Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 5356 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 5428 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 288 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 296 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4305 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 4392 PSUM Banks +2025-11-04T21:41:05Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 10 Sb address +2025-11-04T21:41:05Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 42 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 37 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 14 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 71 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 163 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 108 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1194 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: moved 9 MM forward +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1392 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: moved 21 MM forward +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:06Z USER 9545 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.390 seconds +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1516mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:06Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:06Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 27 Sb address +2025-11-04T21:41:06Z USER 9545 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.489 seconds +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1533mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:06Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:06Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:07Z USER 9545 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.810 seconds +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1684mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:07Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:07Z USER 9545 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.797 seconds +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1569mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:07Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:07Z USER 9545 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.092 seconds +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1485mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:07Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:41:07 2025 +2025-11-04T21:41:07Z INFO 9545 (nc01/sg00) [build_flow_deps]: Allocs: 18243 instructions: 74195 +2025-11-04T21:41:07Z USER 9545 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.106 seconds +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1489mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:07Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:41:07 2025 +2025-11-04T21:41:07Z INFO 9545 (nc00/sg00) [build_flow_deps]: Allocs: 19670 instructions: 76932 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 192180 edges +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [build_flow_deps]: Done build fdeps 192180 Tue Nov 4 21:41:08 2025 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 210611 edges +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [build_flow_deps]: Done build fdeps 210611 Tue Nov 4 21:41:08 2025 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.369 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬───────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼───────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 177 │ 131072 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 2149 │ 1095370816 │ +│ Load │ Internal │ 69 │ 1935366 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 337 │ 1774592 │ +└──────────────┴───────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 66 │ +│ 8 │ 5 │ +│ 16 │ 6 │ +│ 32 │ 60 │ +│ 64 │ 3 │ +│ 88 │ 3 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 898 │ +│ 2048 │ 86 │ +│ 4096 │ 297 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 616 │ +│ 16400 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 57521 #MatMult-Transposes 13944 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5789993152 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60747_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_60851_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.031 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74195 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.380 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 208 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 10 │ 2678024 │ +│ Load │ ExternalInput -> Internal │ 2150 │ 1095370848 │ +│ Load │ Internal │ 81 │ 4456518 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 382 │ 2806858 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 74 │ +│ 8 │ 6 │ +│ 16 │ 6 │ +│ 32 │ 61 │ +│ 64 │ 7 │ +│ 88 │ 3 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 305 │ +│ 1024 │ 913 │ +│ 2048 │ 86 │ +│ 4096 │ 325 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 616 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 58365 #MatMult-Transposes 14728 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5789993152 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60747_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t80228 │ Internal │ float32 │ 2562048 │ +│ -t80222 │ Internal │ float32 │ 2562048 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ -t80225 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ all_gather.1_nostride_60851_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60851_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.030 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76932 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 3.843 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=37913 blocks=2 instructions=151127 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 453 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 407 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:41:08Z INFO 9545 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: assign_trigger_engine finished after 0.088 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151127 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151127 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=18243 blocks=1 instructions=74195 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=19670 blocks=1 instructions=76932 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.024 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.024 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.030 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=37913 blocks=2 instructions=151245 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: assign_hwdge_engine finished after 0.025 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 151245 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151245 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 8 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 226 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 320 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 7 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 25 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2155 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 6 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.014 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 19 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 260 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 285 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 80 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 32 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2380 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 8 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.014 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.023 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.023 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.045 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151245 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:08Z USER 9545 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:08Z INFO 9545 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.001 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.002 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: nc_parallel_pass finished after 0.007 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151245 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:08Z USER 9545 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.137 seconds +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.140 seconds +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.148 seconds +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: curr_vmrss: 1492mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:08Z USER 9545 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:08Z INFO 9545 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=151245 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z USER 9545 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:08Z USER 9545 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:08Z INFO 9545 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:08Z INFO 9545 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:08Z INFO 9545 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:08Z INFO 9545 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 61756 +2025-11-04T21:41:09Z INFO 9545 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 72260 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 65459 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 65459 +2025-11-04T21:41:09Z INFO 9545 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 77016 +2025-11-04T21:41:09Z INFO 9545 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 77016 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [DepReduction]: Finished dependency reduction: 465641 removed, new total 23074 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: dep_reduction finished after 1.227 seconds +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1607mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.016 seconds +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1601mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.048 seconds +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1612mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z INFO 9545 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.012 seconds +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1617mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74254 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:09Z USER 9545 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:09Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=18243 blocks=1 instructions=74254 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2032/2032 (100% DGE) + power-of-2 partition : 2032/2037 (99.7545% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2033/2038 (99.7547% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 31/31 (100% DGE) + power-of-2 partition : 31/423 (7.32861% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 31/423 (7.32861% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 9/9 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: lower_dma finished after 0.062 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1628mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74255 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=18243 blocks=1 instructions=74255 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: expand_all_engine finished after 0.016 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1616mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74255 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=18243 blocks=1 instructions=74255 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [DepReduction]: Finished dependency reduction: 508850 removed, new total 24706 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: dep_reduction finished after 1.414 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.019 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.047 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.012 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76991 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=19670 blocks=1 instructions=76991 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.108 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74255 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=18243 blocks=1 instructions=74255 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2032/2032 (100% DGE) + power-of-2 partition : 2032/2039 (99.6567% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2033/2040 (99.6569% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 40/40 (100% DGE) + power-of-2 partition : 40/485 (8.24742% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 40/485 (8.24742% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 197 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_dma finished after 0.065 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76993 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=19670 blocks=1 instructions=76993 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: expand_all_engine finished after 0.017 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76993 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=19670 blocks=1 instructions=76993 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: expand_inst_late finished after 0.124 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74264 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=18243 blocks=1 instructions=74264 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.013 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 74257 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=18243 blocks=1 instructions=74257 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: lower_sync finished after 0.034 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77484 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=18243 blocks=1 instructions=77484 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.109 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76993 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=19670 blocks=1 instructions=76993 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: lower_act finished after 0.015 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1614mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=18243 blocks=1 instructions=77625 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: expand_inst_late finished after 0.129 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1631mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 77227 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=19670 blocks=1 instructions=77227 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.014 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1636mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 76997 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=19670 blocks=1 instructions=76997 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_sync finished after 0.037 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1638mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80714 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=19670 blocks=1 instructions=80714 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_act finished after 0.016 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1639mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=19670 blocks=1 instructions=80856 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: lower_dve finished after 0.221 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1641mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=18243 blocks=1 instructions=77625 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: lower_ap finished after 0.017 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1617mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=18243 blocks=1 instructions=77625 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: size = 2 +2025-11-04T21:41:10Z INFO 9545 []: find first defs for local reg +2025-11-04T21:41:10Z INFO 9545 []: find first defs for global reg +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_dve finished after 0.243 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1662mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=19670 blocks=1 instructions=80856 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: lower_ap finished after 0.018 seconds +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1631mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z USER 9545 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:10Z INFO 9545 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=19670 blocks=1 instructions=80856 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: lo = 2 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: total = 2 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:10Z INFO 9545 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:10Z USER 9545 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.234 seconds +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: curr_vmrss: 1631mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9545 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:10Z INFO 9545 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:41:10Z INFO 9545 []: find first defs for local reg +2025-11-04T21:41:10Z INFO 9545 []: find first defs for global reg +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:11Z USER 9545 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.260 seconds +2025-11-04T21:41:11Z INFO 9545 (nc00) [CoreForkPass]: curr_vmrss: 1632mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9545 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: nc_parallel_pass finished after 2.496 seconds +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: curr_vmrss: 1598mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: vnc_remote_addr_map finished after 0.010 seconds +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: curr_vmrss: 1557mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 158481 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: Running vnc_link +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z INFO 9545 [VncLink]: Found 0 remote updates +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: vnc_link finished after 0.002 seconds +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: curr_vmrss: 1557mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 158481 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:11Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=19670 blocks=1 instructions=80856 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=18243 blocks=1 instructions=77625 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:11Z USER 9545 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.174 seconds +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1566mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z USER 9545 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.180 seconds +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1550mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 0.185 seconds +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: curr_vmrss: 1549mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:41:11Z INFO 9545 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.032 seconds +2025-11-04T21:41:11Z INFO 9545 (sg00) [SubgraphForkPass]: curr_vmrss: 1550mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9545 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 158481 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: subgraph_parallel_pass finished after 0.037 seconds +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: curr_vmrss: 1549mb, ru_maxrss: 1685mb (delta=0mb) +2025-11-04T21:41:11Z USER 9545 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:11Z INFO 9545 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z USER 9545 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:11Z USER 9545 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=19670 blocks=1 instructions=80856 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=18243 blocks=1 instructions=77625 Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:11Z INFO 9545 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64235 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:11Z INFO 9545 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64235 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 58714 │ +│ LDWEIGHTS │ 58714 │ +│ CAST │ 7150 │ +│ EVENT_SEMAPHORE │ 3227 │ +│ UNKNOWN(0xd4) │ 2186 │ +│ ACTIVATE │ 2120 │ +│ COPY │ 1960 │ +│ TENSOR_TENSOR │ 1273 │ +│ UNKNOWN(0xd8) │ 589 │ +│ PSEUDO_DMA_TRIGGER │ 567 │ +│ TENSOR_SCALAR │ 261 │ +│ MEMSET │ 229 │ +│ ACT_TABLE_LOAD │ 141 │ +│ UNKNOWN(0xe8) │ 114 │ +│ TENSOR_SCALAR_ADDR │ 113 │ +│ UNKNOWN(0xda) │ 68 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 57 │ +│ TENSOR_REDUCE │ 30 │ +│ STREAM_SHUFFLE │ 24 │ +│ LOAD_MASK_SELECT │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 2 │ +│ IOTA │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 4531 │ +│ Scalar │ 10049 │ +│ Tensor │ 118888 │ +│ SyncDMA │ 0 │ +│ Vector │ 4117 │ +│ Sync │ 48 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 59618 │ +│ LDWEIGHTS │ 59618 │ +│ CAST │ 7150 │ +│ EVENT_SEMAPHORE │ 3717 │ +│ UNKNOWN(0xd4) │ 2420 │ +│ COPY │ 2194 │ +│ ACTIVATE │ 2127 │ +│ TENSOR_TENSOR │ 1274 │ +│ TENSOR_SCALAR_ADDR │ 674 │ +│ PSEUDO_DMA_TRIGGER │ 652 │ +│ UNKNOWN(0xd8) │ 589 │ +│ IOTA │ 394 │ +│ UNKNOWN(0xda) │ 293 │ +│ TENSOR_SCALAR │ 263 │ +│ MEMSET │ 241 │ +│ GATHER │ 240 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ ACT_TABLE_LOAD │ 142 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ UNKNOWN(0xe8) │ 114 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ TENSOR_REDUCE │ 35 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 6832 │ +│ Scalar │ 10210 │ +│ Tensor │ 120676 │ +│ SyncDMA │ 0 │ +│ Vector │ 4897 │ +│ Sync │ 79 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:12Z USER 9545 (nc01/sg00) [Codegen]: isa_gen finished after 0.760 seconds +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 5232 │ +│ qDVESpillReload0 │ 112 │ +│ qPoolSpillReload0 │ 39513 │ +│ qSPIO0 │ 33 │ +│ qSPSpillReload0 │ 196 │ +└───────────────────┴────────────────┘ + +Total descriptors: 45086 (0.000671834 GB) +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ reduce.2673.61564_i0_remote_0 │ Internal │ float32 │ 1 │ +│ dot.110.54321 │ Internal │ float32 │ 1 │ +│ all_reduce.59-buffer-80015 │ Internal │ bfloat16 │ 1 │ +│ _reduce.8521-t47962_i1 │ Internal │ float32 │ 1 │ +│ _reduce.8236-t57233_i0_remote_0 │ Internal │ float32 │ 1 │ +│ 38660.56441_i356 │ Internal │ float32 │ 1 │ +│ _dot.2247-t47797_i1 │ Internal │ bfloat16 │ 1 │ +│ split_4 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:12Z USER 9545 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.010 seconds +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:12Z USER 9545 (nc00/sg00) [Codegen]: isa_gen finished after 0.771 seconds +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4644 │ +│ qDVESpillReload0 │ 1252 │ +│ qPoolSpillReload0 │ 50340 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 330 │ +└───────────────────┴────────────────┘ + +Total descriptors: 56617 (0.000843659 GB) +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────┼───────────────┼──────────┼──────────────────┤ +│ get_tuple_element.2 │ Internal │ uint32 │ 2 │ +│ split_5 │ Internal │ float32 │ 2 │ +│ all_gather.2 │ Internal │ float32 │ 2 │ +│ get_tuple_element.6 │ Internal │ float32 │ 2 │ +│ input2 │ ExternalInput │ int32 │ 2 │ +│ get_tuple_element.1 │ Internal │ float32 │ 2 │ +│ get_tuple_element.5 │ Internal │ float32 │ 2 │ +│ custom_call.143 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:12Z USER 9545 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.011 seconds +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:12Z WARNING 9545 (nc01/sg00) [Codegen]: Found 262 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:12Z USER 9545 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.161 seconds +2025-11-04T21:41:12Z WARNING 9545 (nc00/sg00) [Codegen]: Found 274 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:12Z USER 9545 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.169 seconds +2025-11-04T21:41:12Z USER 9545 (nc01/sg00) [ModuleForkPass]: codegen finished after 0.957 seconds +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1718mb, ru_maxrss: 1719mb (delta=34mb) +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 18243 memory location(s), 1 block(s), and 77625 instruction(s). Max writers: 299 Max Readers: 13901 +2025-11-04T21:41:12Z USER 9545 (nc00/sg00) [ModuleForkPass]: codegen finished after 0.982 seconds +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1661mb, ru_maxrss: 1719mb (delta=34mb) +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 19670 memory location(s), 1 block(s), and 80856 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:12Z USER 9545 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:12Z USER 9545 [BackendPassManager]: mod_parallel_pass finished after 1.002 seconds +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: curr_vmrss: 1554mb, ru_maxrss: 1719mb (delta=34mb) +2025-11-04T21:41:12Z USER 9545 [BackendPassManager]: Running hbm_usage +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 126.156KB │ +│ CCE │ 0.000B │ 674.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 161.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.660GB │ +│ Model Code │ 8.709MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.012MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 961.828KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 416.000B │ 114.594KB │ +│ CCE │ 0.000B │ 506.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.500KB │ 140.250KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:12Z INFO 9545 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.660GB │ +│ Model Code │ 8.400MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.012MB │ +│ DMA Ring IO │ 1.906KB │ +│ DMA Ring Spill │ 761.516KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:12Z INFO 9545 [HBMUsage]: Total estimated HBM usage is: 3.675GB +2025-11-04T21:41:12Z USER 9545 [BackendPassManager]: hbm_usage finished after 0.010 seconds +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: curr_vmrss: 1532mb, ru_maxrss: 1719mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 158481 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:12Z USER 9545 [BackendPassManager]: Running neff_packager +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=37913 blocks=2 instructions=158481 Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:12Z WARNING 9545 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi/metrics.json +2025-11-04T21:41:12Z WARNING 9545 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:41:12Z INFO 9545 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff +2025-11-04T21:41:12Z INFO 9545 [NeffFileWriter]: IR signature: dcbe023a64b2752c4661c5bec5cc2fae for neff artifacts +2025-11-04T21:41:12Z USER 9545 [BackendPassManager]: neff_packager finished after 0.328 seconds +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: curr_vmrss: 1533mb, ru_maxrss: 1719mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9545 [BackendPassManager]: Output has 2 module(s), 2 function(s), 37913 memory location(s), 2 block(s), and 158481 instruction(s). Max writers: 299 Max Readers: 14685 +2025-11-04T21:41:12Z INFO 9545 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000076 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005871 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000111 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005882 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000076 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005871 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000072 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000072 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005871 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:41:12Z INFO 9545 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_6 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_2 │ uint8 │ 1 │ 0.003906 MB │ +│ split_4 │ int32 │ 1 │ 0.003906 MB │ +│ split_5 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:12Z INFO 9545 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:12Z INFO 9545 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_5 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_2 │ uint8 │ 1 │ 0.003906 MB │ +│ split_4 │ float32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:12Z INFO 9545 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:41:13Z INFO 8854 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:41:13Z INFO 8854 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:41:13Z INFO 8854 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi +2025-11-04T21:41:13Z INFO 8854 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:41:13Z INFO 8854 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi/hlo_netlist.json +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk3/neuronxcc-yv9014yi/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:41:13Z INFO 8854 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:41:13Z INFO 8854 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:41:13Z INFO 8806 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk3/metaneff.pb b/token_generation_model/_tp0_bk3/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..293f93b9d359efe42e0caca14b56a8605a76432e --- /dev/null +++ b/token_generation_model/_tp0_bk3/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad000b7e61891da5d3b104ddde14fd7092fc0da3e1d496a40ea2c98590e66220 +size 3988817 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb b/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..a3b10f7ea1f8a1d9dfd0b9a53ad33fc0a7adbd1e --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888f9de5403c40b142830f985d19146aea7b7b770116023d920e0a8e22ac3ddb +size 4075105 diff --git a/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff b/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff new file mode 100644 index 0000000000000000000000000000000000000000..3362b302960e9ba17a300c5cdea01194e6ec2f0a --- /dev/null +++ b/token_generation_model/_tp0_bk3/model.MODULE_8aa2bc135acfce1f4a61+bd0ab490.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7725ff2a45b5375e7597bb8ce00907e416a0f70c0e8323f6633b51ad821fe52e +size 7465984 diff --git a/token_generation_model/_tp0_bk3/neuron_config.json b/token_generation_model/_tp0_bk3/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..32b4bf94e7d6b700189048c4d971b7c9dee1da2a --- /dev/null +++ b/token_generation_model/_tp0_bk3/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 1024 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 1024 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk4/command.txt b/token_generation_model/_tp0_bk4/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a06f40a87475364663184c9619663791d7c651a --- /dev/null +++ b/token_generation_model/_tp0_bk4/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb --output model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk4/compile_flags.MODULE_ec05e5a8222761962028+3b7d8ecf.json b/token_generation_model/_tp0_bk4/compile_flags.MODULE_ec05e5a8222761962028+3b7d8ecf.json new file mode 100644 index 0000000000000000000000000000000000000000..01ce584722083f58783fdb5bfa1c06a851d25e18 --- /dev/null +++ b/token_generation_model/_tp0_bk4/compile_flags.MODULE_ec05e5a8222761962028+3b7d8ecf.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk4/global_metric_store.json b/token_generation_model/_tp0_bk4/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..2877fe5d4b078f97b9116dfa9b40a9ea42f9b4ba --- /dev/null +++ b/token_generation_model/_tp0_bk4/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.57540893554688, + "StaticProfiler::AveragePartitionUtilization": 88.06752014160156, + "StaticProfiler::AveragePeUtilization": 72.09053802490234, + "StaticProfiler::LocalizationEfficiency": 212.57244873046875, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 217.00836181640625, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.5843513011932373, + "AffinePredicateResolution": 0.09803915023803711, + "AliasDependencyElimination": 0.003720521926879883, + "AliasDependencyInduction": 0.7286062240600586, + "AliasDependencyReset": 0.7808692455291748, + "BFComputeCutting": 0.06884360313415527, + "BirCodeGenLoop": 2.4264121055603027, + "CCOpFusion": 0.7197604179382324, + "CanonicalizeConv": 0.00019299999985378236, + "CanonicalizeDAGForPGTiling": 0.16963481903076172, + "CanonicalizeForTensorizer": 0.00042100000428035855, + "CanonicalizeIR": 0.10015439987182617, + "Canonicalizer": 0.009693999774754047, + "CoalesceCCOp": 0.17603373527526855, + "CommuteConcat": 0.03352236747741699, + "DMALocalityOpt": 0.04418063163757324, + "DMAProfiler": 0.08575034141540527, + "DMATilingProfiler": 0.09742927551269531, + "DataLocalityOpt": 3.2403125762939453, + "DataStreaming": 0.17079639434814453, + "DeConcat": 0.06206154823303223, + "DeadCodeElimination": 0.0335233211517334, + "DeadStoreElimination": 1.1721632480621338, + "DelinearIndices": 0.4310429096221924, + "Delinearization": 0.1684703826904297, + "DelinearizeSPMD": 0.21332669258117676, + "DoNothing": 0.0003631114959716797, + "DramToDramTranspose": 0.3430459499359131, + "DumpGraphAndMetadata": 0.14450669288635254, + "EliminateDivs": 0.1906135082244873, + "ExpandBatchNorm": 0.09135627746582031, + "ExpandISAMacro": 0.09242129325866699, + "FactorizeBlkDims": 0.6308884620666504, + "FactorizeThreadAxesInFreeDims": 0.08972764015197754, + "FlattenMacroLoop": 0.10975432395935059, + "GenericAccessSimplifier": 0.028026580810546875, + "HoistCompute": 8.600000001024455e-05, + "IdentifyCrossPassTensors": 0.00017800000205170363, + "InferInitValue": 1.3941283226013184, + "InferIntrinsicOnCC": 0.3329617977142334, + "InferNeuronTensor": 1.694812536239624, + "InferNonlocalTensors": 5.505799293518066, + "InferPSumTensor": 1.4320018291473389, + "InferShardAxis": 9.232810020446777, + "InferSharedMemLoc": 0.12128853797912598, + "InlineNativeKernels": 0.05772900581359863, + "InsertCoreBarrier": 0.16314268112182617, + "InsertIOTransposes": 0.8876657485961914, + "InsertImplicitShardAxisBeforeISel": 0.42287588119506836, + "InsertLocalTransposes": 0.7345812320709229, + "InsertOffloadedTransposes": 0.1270277500152588, + "LICM": 0.12036418914794922, + "LateLegalizeInst": 0.15407085418701172, + "LateLegalizePostSplit": 0.10393857955932617, + "LateLowerReshapeOp": 0.03914213180541992, + "LateLowerTensorOp": 0.3898463249206543, + "LateNeuronInstComb": 1.1302504539489746, + "LayoutPreprocessing": 0.9246642589569092, + "LayoutPreprocessingAndAnalysis": 1.368058681488037, + "LayoutRequirementAnalysis": 0.43490099906921387, + "LegalizeCCOpLayout": 0.12948179244995117, + "LegalizeOpLevelAlias": 0.05484366416931152, + "LegalizePartitionReduce": 0.09279680252075195, + "LegalizeSundaAccess": 0.9800224304199219, + "LegalizeSundaMacro": 0.7192654609680176, + "LegalizeType": 0.15319442749023438, + "LocalLayoutOpt": 0.6889605522155762, + "LoopFusion": 0.3984076976776123, + "LoopSplitting": 0.030361413955688477, + "LowerBroadcast": 0.11587929725646973, + "LowerCCOpBlockAxis": 0.21748971939086914, + "LowerComplexBroadcast": 0.07869768142700195, + "LowerIntrinsics": 1.3117561340332031, + "LowerShardAxis": 0.2316148281097412, + "LowerTensorOp": 0.6237733364105225, + "LowerToSendRecv": 0.16186237335205078, + "LowerTranspose": 0.4949946403503418, + "MacroGeneration": 2.6892099380493164, + "MaskPropagation": 0.11065173149108887, + "MemcastMotion": 0.00013899999612476677, + "MemcpyElimination": 8.901872634887695, + "MutateDataType": 0.04123187065124512, + "NeuronAliasDependencyInduction": 0.020069360733032227, + "NeuronAliasDependencyReset": 0.037007808685302734, + "NeuronInstComb": 0.4116497039794922, + "NeuronLICM": 0.28838109970092773, + "NeuronLoopFusion": 1.6053996086120605, + "NeuronLoopInterchange": 0.06507086753845215, + "NeuronSimplifier": 0.5044054985046387, + "NeuronSimplifyPredicates": 0.2828092575073242, + "NeuronValueNumbering": 0.12811660766601563, + "OptimizeAliasedCopyChain": 0.02622509002685547, + "OptimizeNKIKernels": 1.742466688156128, + "PAGLayoutOpt": 15.520881652832031, + "PComputeCutting": 0.3109705448150635, + "PGLayoutTilingPipeline": 40.75271987915039, + "PGTiling": 5.968484878540039, + "PadElimination": 0.019017696380615234, + "ParAxesAnnotation": 14.774547576904297, + "PartialLoopFusion": 1.2596435546875, + "PartialSimdFusion": 0.8532435894012451, + "PenguinizeFunctions": 0.0002610000083222985, + "PerfectLoopNest": 0.07212471961975098, + "PruneFunctions": 0.00015100000018719584, + "RecognizeOpIdiom": 0.12650442123413086, + "Recompute": 0.009026050567626953, + "RelaxPredicates": 0.11737942695617676, + "Rematerialization": 0.26503896713256836, + "RemoveOptimizationBarriers": 7.79999973019585e-05, + "RemoveShardedPartitionAxes": 1.2393772602081299, + "ReshapeWeights": 0.038315773010253906, + "ResolveAccessConflict": 0.1929621696472168, + "ResolveComplicatePredicates": 0.09970426559448242, + "RewriteReplicationMatmul": 0.047566890716552734, + "RewriteWeights": 0.09689998626708984, + "SFKVectorizer": 6.856188774108887, + "ScatterMotion": 0.0038960000965744257, + "ShardingPropagationAnalysis": 0.7366189956665039, + "SimpleAllReduceTiling": 0.0969705581665039, + "Simplifier": 0.09788918495178223, + "SimplifyMacroPredicates": 0.2831263542175293, + "SimplifyNeuronTensor": 0.3997683525085449, + "SimplifySlice": 0.031499624252319336, + "SimplifyTensor": 0.28774023056030273, + "SpillPSum": 0.6983683109283447, + "SplitAPUnionSets": 0.5226767063140869, + "SplitAccGrp": 0.0517582893371582, + "StaticProfiler": 0.13461875915527344, + "StaticTransposeLocalTensor": 0.24522829055786133, + "SundaISel": 1.8519799709320068, + "TCTransform": 0.03553414344787598, + "TensorInitialization": 0.22464632987976074, + "TensorOpSimplifier": 0.7551205158233643, + "TensorOpTransform": 2.3550097942352295, + "TensorizerLegalizationPass": 0.00022600000374950469, + "TileCCOps": 0.2578256130218506, + "TilingProfiler": 0.49207592010498047, + "TransformConvOp": 0.15656113624572754, + "TritiumFusion": 1.6420443058013916, + "ValueNumbering": 0.10038232803344727, + "VectorizeDMA": 0.6796724796295166, + "VectorizeMatMult": 0.08152270317077637, + "VerifySupportedOps": 0.0003929999948013574, + "WeightCoalescing": 0.0657052993774414, + "ZeroSizeTensorElimination": 0.0004067420959472656, + "algsimp": 0.0023870000150054693, + "batchnorm_expander": 0.001151000033132732, + "boundary-marker-removal": 0.0006120000034570694, + "call-inliner": 0.0003260000084992498, + "canonicalize-boundary-marker": 0.0006409999914467335, + "collective-stream-id-checker": 0.00012199999764561653, + "comparison-expander": 0.0006820000126026571, + "computation-deduplicator": 0.0006070000235922635, + "config-lowering": 0.00036700000055134296, + "constant_folding": 0.00021699999342672527, + "cse": 0.00072900002123788, + "dce": 4.5000000682193786e-05, + "dynamic-slice-transpose": 0.00016700000560376793, + "eliminate-redundant-compare": 0.0001849999971454963, + "emit-offloaded-dropout": 0.00031599999056197703, + "flatten-call-graph": 0.0005360000068321824, + "fuse-send-recv": 0.002185000106692314, + "hilo-conditional-to-select": 0.00011700000322889537, + "hilo::LegalizeAlias": 0.003029000014066696, + "hilo::NeuronInstCombine": 3.000000106112566e-06, + "hilo::NeuronOpFusion": 0.001294999965466559, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00026000000070780516, + "hilo::ScheduleFusion": 4.5000000682193786e-05, + "hilo::SixtyFourHack": 0.00040600000647827983, + "hilo::VerifyAliasing": 0.0001289999927394092, + "hlo-mac-count": 0.005212000105530024, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015899999998509884, + "legalize-ccops-for-tensorizer": 2.4000000848900527e-05, + "legalize-compare": 0.0004729999927803874, + "lower-argminmax-custom-call": 0.0001829999964684248, + "map-inline": 0.0010430000256747007, + "metadata-naming": 0.0018629999831318855, + "mlir::detail::OpToOpPassAdaptor": 0.00016700000560376793, + "mlir::hlo::MhloToPyPenguin": 0.10183499753475189, + "mlir::mhlo::LowerComplexExtraPass": 0.00419300002977252, + "mlir::mhlo::LowerComplexPass": 0.0058280001394450665, + "native-to-custom-softmax": 0.0005259999888949096, + "native-to-custom-softmax-dx": 0.0006120000034570694, + "neuron-hlo-verifier": 0.025609999895095825, + "operand_upcaster": 0.0010239999974146485, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0612110011279583, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00029600001289509237, + "reshape-mover": 9.200000204145908e-05, + "simplify-concat": 0.0017969999462366104, + "simplify-while-loops": 7.300000288523734e-05, + "transform-variadic-reduce": 0.0009480000007897615, + "tuple-simplifier": 0.00021499999274965376, + "unpack-nested-aws-ntwsr": 0.0005779999773949385, + "unroll-while-loop": 1.2000000424450263e-05 + }, + "hilo": { + "HloMacCount": 7838334976.0, + "Traffic": 3915428608.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 166084, + "StaticProfiler::AifUb": 20.44274139404297, + "StaticProfiler::ArithmeticIntensityTensorizer": 43.45563507080078, + "StaticProfiler::AverageDmaLength": 3762.89697265625, + "StaticProfiler::DDRTransferBytes": 2723867604, + "StaticProfiler::InternalTransferBytes": 820260032, + "StaticProfiler::LoadExpanded": 578854, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 587203, + "StaticProfiler::TotalDynamicInstancesCount": 194762, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 151356, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 103585, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 41869, + "TilingProfiler::PfTransposeInstructionsForIo": 38192, + "TilingProfiler::PfTransposeInstructionsForLocal": 1715, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 565, + "TilingProfiler::SimdInstructionsAfterTiling": 3816, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.00019299999985378236, + "CanonicalizeForTensorizer": 0.00042100000428035855, + "Canonicalizer": 0.009693999774754047, + "HoistCompute": 8.600000001024455e-05, + "IdentifyCrossPassTensors": 0.00017800000205170363, + "MemcastMotion": 0.00013899999612476677, + "PenguinizeFunctions": 0.0002610000083222985, + "PruneFunctions": 0.00015100000018719584, + "RemoveOptimizationBarriers": 7.79999973019585e-05, + "ScatterMotion": 0.0038960000965744257, + "TensorizerLegalizationPass": 0.00022600000374950469, + "VerifySupportedOps": 0.0003929999948013574, + "algsimp": 0.0023870000150054693, + "batchnorm_expander": 0.001151000033132732, + "boundary-marker-removal": 0.0006120000034570694, + "call-inliner": 0.0003260000084992498, + "canonicalize-boundary-marker": 0.0006409999914467335, + "collective-stream-id-checker": 0.00012199999764561653, + "comparison-expander": 0.0006820000126026571, + "computation-deduplicator": 0.0006070000235922635, + "config-lowering": 0.00036700000055134296, + "constant_folding": 0.00021699999342672527, + "cse": 0.00072900002123788, + "dce": 4.5000000682193786e-05, + "dynamic-slice-transpose": 0.00016700000560376793, + "eliminate-redundant-compare": 0.0001849999971454963, + "emit-offloaded-dropout": 0.00031599999056197703, + "flatten-call-graph": 0.0005360000068321824, + "fuse-send-recv": 0.002185000106692314, + "hilo-conditional-to-select": 0.00011700000322889537, + "hilo::LegalizeAlias": 0.003029000014066696, + "hilo::NeuronInstCombine": 3.000000106112566e-06, + "hilo::NeuronOpFusion": 0.001294999965466559, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00026000000070780516, + "hilo::ScheduleFusion": 4.5000000682193786e-05, + "hilo::SixtyFourHack": 0.00040600000647827983, + "hilo::VerifyAliasing": 0.0001289999927394092, + "hlo-mac-count": 0.005212000105530024, + "io-con-pipe-begin": 9.999999747378752e-06, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015899999998509884, + "legalize-ccops-for-tensorizer": 2.4000000848900527e-05, + "legalize-compare": 0.0004729999927803874, + "lower-argminmax-custom-call": 0.0001829999964684248, + "map-inline": 0.0010430000256747007, + "metadata-naming": 0.0018629999831318855, + "mlir::detail::OpToOpPassAdaptor": 0.00016700000560376793, + "mlir::hlo::MhloToPyPenguin": 0.10183499753475189, + "mlir::mhlo::LowerComplexExtraPass": 0.00419300002977252, + "mlir::mhlo::LowerComplexPass": 0.0058280001394450665, + "native-to-custom-softmax": 0.0005259999888949096, + "native-to-custom-softmax-dx": 0.0006120000034570694, + "neuron-hlo-verifier": 0.025609999895095825, + "operand_upcaster": 0.0010239999974146485, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.0612110011279583, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.00029600001289509237, + "reshape-mover": 9.200000204145908e-05, + "simplify-concat": 0.0017969999462366104, + "simplify-while-loops": 7.300000288523734e-05, + "transform-variadic-reduce": 0.0009480000007897615, + "tuple-simplifier": 0.00021499999274965376, + "unpack-nested-aws-ntwsr": 0.0005779999773949385, + "unroll-while-loop": 1.2000000424450263e-05 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002281665802001953, + "DMALocalityOpt": 0.00018405914306640625, + "DMAProfiler": 0.0007257461547851563, + "DataStreaming": 0.0002856254577636719, + "DoNothing": 0.00012564659118652344, + "ExpandISAMacro": 0.00057220458984375, + "FactorizeBlkDims": 0.0004589557647705078, + "InferPSumTensor": 0.0006234645843505859, + "InferSharedMemLoc": 0.0002827644348144531, + "InsertCoreBarrier": 0.0002789497375488281, + "LateLegalizeInst": 0.00046324729919433594, + "LateNeuronInstComb": 0.0012440681457519531, + "LegalizeSundaAccess": 0.0020890235900878906, + "LegalizeType": 0.0002605915069580078, + "LowerBroadcast": 0.0002334117889404297, + "LowerIntrinsics": 0.00022172927856445313, + "LowerTranspose": 0.00023984909057617188, + "NeuronInstComb": 0.0006899833679199219, + "NeuronLICM": 0.00037169456481933594, + "NeuronSimplifyPredicates": 0.0023546218872070313, + "NeuronValueNumbering": 0.0004534721374511719, + "SFKVectorizer": 0.0027189254760742188, + "SimpleAllReduceTiling": 0.00021076202392578125, + "SimplifyNeuronTensor": 0.0006062984466552734, + "SpillPSum": 0.0005254745483398438, + "WeightCoalescing": 0.0002200603485107422 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 4.003819465637207, + "HloMacCount": 7838334976.0, + "Traffic": 3915428608.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.5843513011932373, + "AffinePredicateResolution": 0.09803915023803711, + "AliasDependencyElimination": 0.003720521926879883, + "AliasDependencyInduction": 0.7286062240600586, + "AliasDependencyReset": 0.7808692455291748, + "BFComputeCutting": 0.06884360313415527, + "BirCodeGenLoop": 2.4264121055603027, + "CCOpFusion": 0.7197604179382324, + "CanonicalizeDAGForPGTiling": 0.16963481903076172, + "CanonicalizeIR": 0.10015439987182617, + "CoalesceCCOp": 0.17328882217407227, + "CommuteConcat": 0.03352236747741699, + "DMALocalityOpt": 0.04186272621154785, + "DMAProfiler": 0.08235645294189453, + "DMATilingProfiler": 0.09742927551269531, + "DataLocalityOpt": 3.2403125762939453, + "DataStreaming": 0.1663661003112793, + "DeConcat": 0.06206154823303223, + "DeadCodeElimination": 0.0335233211517334, + "DeadStoreElimination": 1.1721632480621338, + "DelinearIndices": 0.4310429096221924, + "Delinearization": 0.1684703826904297, + "DelinearizeSPMD": 0.21332669258117676, + "DoNothing": 6.389617919921875e-05, + "DramToDramTranspose": 0.3430459499359131, + "DumpGraphAndMetadata": 0.14450669288635254, + "EliminateDivs": 0.1906135082244873, + "ExpandBatchNorm": 0.09135627746582031, + "ExpandISAMacro": 0.08912777900695801, + "FactorizeBlkDims": 0.6221714019775391, + "FactorizeThreadAxesInFreeDims": 0.08972764015197754, + "FlattenMacroLoop": 0.10975432395935059, + "GenericAccessSimplifier": 0.028026580810546875, + "InferInitValue": 1.3941283226013184, + "InferIntrinsicOnCC": 0.3329617977142334, + "InferNeuronTensor": 1.694812536239624, + "InferNonlocalTensors": 5.505799293518066, + "InferPSumTensor": 1.424811840057373, + "InferShardAxis": 9.232810020446777, + "InferSharedMemLoc": 0.11865115165710449, + "InlineNativeKernels": 0.05772900581359863, + "InsertCoreBarrier": 0.16063332557678223, + "InsertIOTransposes": 0.8876657485961914, + "InsertImplicitShardAxisBeforeISel": 0.42287588119506836, + "InsertLocalTransposes": 0.7345812320709229, + "InsertOffloadedTransposes": 0.1270277500152588, + "LICM": 0.12036418914794922, + "LateLegalizeInst": 0.14893794059753418, + "LateLegalizePostSplit": 0.10393857955932617, + "LateLowerReshapeOp": 0.03914213180541992, + "LateLowerTensorOp": 0.3898463249206543, + "LateNeuronInstComb": 1.1230978965759277, + "LayoutPreprocessing": 0.9246642589569092, + "LayoutPreprocessingAndAnalysis": 1.368058681488037, + "LayoutRequirementAnalysis": 0.43490099906921387, + "LegalizeCCOpLayout": 0.12948179244995117, + "LegalizeOpLevelAlias": 0.05484366416931152, + "LegalizePartitionReduce": 0.09279680252075195, + "LegalizeSundaAccess": 0.969397783279419, + "LegalizeSundaMacro": 0.7192654609680176, + "LegalizeType": 0.14724373817443848, + "LocalLayoutOpt": 0.6889605522155762, + "LoopFusion": 0.3984076976776123, + "LoopSplitting": 0.030361413955688477, + "LowerBroadcast": 0.11274886131286621, + "LowerCCOpBlockAxis": 0.21748971939086914, + "LowerComplexBroadcast": 0.07869768142700195, + "LowerIntrinsics": 1.3090062141418457, + "LowerShardAxis": 0.2316148281097412, + "LowerTensorOp": 0.6237733364105225, + "LowerToSendRecv": 0.16186237335205078, + "LowerTranspose": 0.49242329597473145, + "MacroGeneration": 2.6892099380493164, + "MaskPropagation": 0.11065173149108887, + "MemcpyElimination": 8.901872634887695, + "MutateDataType": 0.04123187065124512, + "NeuronAliasDependencyInduction": 0.020069360733032227, + "NeuronAliasDependencyReset": 0.037007808685302734, + "NeuronInstComb": 0.40495777130126953, + "NeuronLICM": 0.28164124488830566, + "NeuronLoopFusion": 1.6053996086120605, + "NeuronLoopInterchange": 0.06507086753845215, + "NeuronSimplifier": 0.5044054985046387, + "NeuronSimplifyPredicates": 0.27791833877563477, + "NeuronValueNumbering": 0.12485218048095703, + "OptimizeAliasedCopyChain": 0.02622509002685547, + "OptimizeNKIKernels": 1.742466688156128, + "PAGLayoutOpt": 15.520881652832031, + "PComputeCutting": 0.3109705448150635, + "PGLayoutTilingPipeline": 40.75271987915039, + "PGTiling": 5.968484878540039, + "PadElimination": 0.019017696380615234, + "ParAxesAnnotation": 14.774547576904297, + "PartialLoopFusion": 1.2596435546875, + "PartialSimdFusion": 0.8532435894012451, + "PerfectLoopNest": 0.07212471961975098, + "RecognizeOpIdiom": 0.12650442123413086, + "Recompute": 0.009026050567626953, + "RelaxPredicates": 0.11737942695617676, + "Rematerialization": 0.26503896713256836, + "RemoveShardedPartitionAxes": 1.2393772602081299, + "ReshapeWeights": 0.038315773010253906, + "ResolveAccessConflict": 0.1929621696472168, + "ResolveComplicatePredicates": 0.09970426559448242, + "RewriteReplicationMatmul": 0.047566890716552734, + "RewriteWeights": 0.09689998626708984, + "SFKVectorizer": 6.832403182983398, + "ShardingPropagationAnalysis": 0.7366189956665039, + "SimpleAllReduceTiling": 0.09433388710021973, + "Simplifier": 0.09788918495178223, + "SimplifyMacroPredicates": 0.2831263542175293, + "SimplifyNeuronTensor": 0.3514747619628906, + "SimplifySlice": 0.031499624252319336, + "SimplifyTensor": 0.28774023056030273, + "SpillPSum": 0.6855485439300537, + "SplitAPUnionSets": 0.5226767063140869, + "SplitAccGrp": 0.0517582893371582, + "StaticProfiler": 0.13461875915527344, + "StaticTransposeLocalTensor": 0.24522829055786133, + "SundaISel": 1.8519799709320068, + "TCTransform": 0.03553414344787598, + "TensorInitialization": 0.22464632987976074, + "TensorOpSimplifier": 0.7551205158233643, + "TensorOpTransform": 2.3550097942352295, + "TileCCOps": 0.2578256130218506, + "TilingProfiler": 0.49207592010498047, + "TransformConvOp": 0.15656113624572754, + "TritiumFusion": 1.6420443058013916, + "ValueNumbering": 0.10038232803344727, + "VectorizeDMA": 0.6796724796295166, + "VectorizeMatMult": 0.08152270317077637, + "WeightCoalescing": 0.06256771087646484, + "ZeroSizeTensorElimination": 0.0004067420959472656 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 166084, + "StaticProfiler::AifUb": 20.44274139404297, + "StaticProfiler::ArithmeticIntensityTensorizer": 43.45563507080078, + "StaticProfiler::AverageDmaLength": 3762.89697265625, + "StaticProfiler::AverageFractalPeUtilization": 98.57540893554688, + "StaticProfiler::AveragePartitionUtilization": 88.06752014160156, + "StaticProfiler::AveragePeUtilization": 72.09053802490234, + "StaticProfiler::DDRTransferBytes": 2723867604, + "StaticProfiler::InternalTransferBytes": 820260032, + "StaticProfiler::LoadExpanded": 578854, + "StaticProfiler::LocalizationEfficiency": 212.57244873046875, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 217.00836181640625, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 587203, + "StaticProfiler::TotalDynamicInstancesCount": 194762, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 151356, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 103585, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 41869, + "TilingProfiler::PfTransposeInstructionsForIo": 38192, + "TilingProfiler::PfTransposeInstructionsForLocal": 1715, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 565, + "TilingProfiler::SimdInstructionsAfterTiling": 3816, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.0025167465209960938, + "DMALocalityOpt": 0.0021338462829589844, + "DMAProfiler": 0.002668142318725586, + "DataStreaming": 0.0041446685791015625, + "DoNothing": 0.0001735687255859375, + "ExpandISAMacro": 0.0027213096618652344, + "FactorizeBlkDims": 0.00825810432434082, + "InferPSumTensor": 0.006566524505615234, + "InferSharedMemLoc": 0.0023546218872070313, + "InsertCoreBarrier": 0.002230405807495117, + "LateLegalizeInst": 0.004669666290283203, + "LateNeuronInstComb": 0.005908489227294922, + "LegalizeSundaAccess": 0.008535623550415039, + "LegalizeType": 0.005690097808837891, + "LowerBroadcast": 0.002897024154663086, + "LowerIntrinsics": 0.0025281906127929688, + "LowerTranspose": 0.0023314952850341797, + "NeuronInstComb": 0.006001949310302734, + "NeuronLICM": 0.006368160247802734, + "NeuronSimplifyPredicates": 0.002536296844482422, + "NeuronValueNumbering": 0.002810955047607422, + "SFKVectorizer": 0.021066904067993164, + "SimpleAllReduceTiling": 0.0024259090423583984, + "SimplifyNeuronTensor": 0.04768729209899902, + "SpillPSum": 0.012294292449951172, + "WeightCoalescing": 0.0029175281524658203 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk4/graph.neff b/token_generation_model/_tp0_bk4/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..93e5b5f83483e9053651ad4f12e1fcfbc490a2d0 --- /dev/null +++ b/token_generation_model/_tp0_bk4/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f876ca24c3808c5b51fa8c58cffc7004fbec6bbc8196380a31d0c971a6b88875 +size 8889344 diff --git a/token_generation_model/_tp0_bk4/log-neuron-cc.txt b/token_generation_model/_tp0_bk4/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b0af24edea4581a2cde1bd326fccdc3d58f729a --- /dev/null +++ b/token_generation_model/_tp0_bk4/log-neuron-cc.txt @@ -0,0 +1,4614 @@ +2025-11-04T21:38:36Z INFO 8807 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:36Z INFO 8807 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:36Z INFO 8868 [root]: XLA detected +2025-11-04T21:38:36Z INFO 8868 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:36Z INFO 8868 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4 +2025-11-04T21:38:36Z INFO 8868 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:36Z INFO 8868 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8868 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:36Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:36Z INFO 8868 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:36Z INFO 8868 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8868 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:37Z INFO 8868 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:38:36.996675: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:38:37.005280: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10701 = tuple(%reshape.4385, %scatter.9929, %scatter.9944, %scatter.9957, %scatter.9972, %scatter.9985, %scatter.10000, %scatter.10013, %scatter.10028, %scatter.10041, %scatter.10056, %scatter.10069, %scatter.10084, %scatter.10097, %scatter.10112, %scatter.10125, %scatter.10140, %scatter.10153, %scatter.10168, %scatter.10181, %scatter.10196, %scatter.10209, %scatter.10224, %scatter.10237, %scatter.10252, %scatter.10265, %scatter.10280, %scatter.10293, %scatter.10308, %scatter.10321, %scatter.10336... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:37Z INFO 8868 [job.HLOToTensorizer.0]: IR signature: 839dbfa68953abfab257d6496cf99f9e15d8799acdaac89f66d8fc66ce8056ef for sg0000/HLOToTensorizer +2025-11-04T21:38:37Z INFO 8868 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:37Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:37Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:37Z INFO 8868 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:37Z INFO 8868 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:37Z INFO 8868 [job.Frontend.0]: Start model loading +2025-11-04T21:38:37Z INFO 8868 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:37Z INFO 8868 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:38:37Z USER 8868 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:37Z INFO 8868 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:38:37Z INFO 8868 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:38:39Z INFO 8868 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.055 seconds +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.026 seconds +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.179 seconds +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.157 seconds +2025-11-04T21:38:39Z INFO 8868 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.624 seconds +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.564 seconds +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.610 seconds +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.129 seconds +2025-11-04T21:38:41Z INFO 8868 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.755 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.100 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.100 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.098 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.191 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.067 seconds +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:42Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.330 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.316 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.207 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.854 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.069 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.088 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.091 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.091 seconds +2025-11-04T21:38:43Z INFO 8868 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.091 seconds +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.075 seconds +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.070 seconds +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:44Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 2.017 seconds +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.336 seconds +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 2.355 seconds +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.390 seconds +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:46Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.022 seconds +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.729 seconds +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.781 seconds +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:47Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 8.509 seconds +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.392 seconds +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 8.902 seconds +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:56Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:57Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 1.008 seconds +2025-11-04T21:38:57Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.348 seconds +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.248 seconds +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.402 seconds +2025-11-04T21:38:58Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.405 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.447 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.265 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.200 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.167 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.187 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.556 seconds +2025-11-04T21:38:59Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:00Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.865 seconds +2025-11-04T21:39:00Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.668 seconds +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.118 seconds +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.120 seconds +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.073 seconds +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.184 seconds +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:02Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.200 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.116 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.233 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.557 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.031 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.058 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.110 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.104 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.216 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.125 seconds +2025-11-04T21:39:03Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.057 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.019 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.147 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.194 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.197 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.398 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.031 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.103 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.104 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.055 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.100 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.036 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:04Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.034 seconds +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.126 seconds +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.127 seconds +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.099 seconds +2025-11-04T21:39:05Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.172 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.009 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.031 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.032 seconds +2025-11-04T21:39:06Z INFO 8868 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.041 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.028 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.097 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.098 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-9247 AllGather_add(float32 (256, 8) %'transpose.537', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9754 | hlo_id: 9754 | , id = 9247 +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-9263 AllGather_add(uint32 (256, 8) %'transpose.538', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9893 | hlo_id: 9893 | , id = 9263 +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.258 seconds +2025-11-04T21:39:06Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.548 seconds +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.139 seconds +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.339 seconds +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.034 seconds +2025-11-04T21:39:07Z INFO 8868 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.039 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.333 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.031 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.193 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.059 seconds +2025-11-04T21:39:08Z INFO 8868 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.689 seconds +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.382 seconds +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:09Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.278 seconds +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.131 seconds +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.925 seconds +2025-11-04T21:39:10Z INFO 8868 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:39:11Z INFO 8868 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.435 seconds +2025-11-04T21:39:11Z INFO 8868 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.368 seconds +2025-11-04T21:39:11Z INFO 8868 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:39:11Z INFO 8868 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:12Z INFO 8868 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:16Z INFO 8868 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:39:16Z INFO 8868 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 5.506 seconds +2025-11-04T21:39:16Z INFO 8868 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:39:16Z INFO 8868 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:39:17Z INFO 8868 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:39:31Z INFO 8868 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:39:31Z INFO 8868 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 14.775 seconds +2025-11-04T21:39:31Z INFO 8868 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.735 seconds +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 15.521 seconds +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.168 seconds +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.213 seconds +2025-11-04T21:39:32Z INFO 8868 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:39:33Z INFO 8868 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.737 seconds +2025-11-04T21:39:33Z INFO 8868 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:39:40Z INFO 8868 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:39:40Z INFO 8868 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1159 +total number of sharded dags: 408 + +total bytes transferred from input, output, non local tensors: 2667063624 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 2664431748 +% bytes transferred with 2x bandwidths: 99.90 + +NC0 FLOPs: 7849649305 +NC1 FLOPs: 7842310016 +% FLOPs sharded: 99.95 + + +Shard dim: 2048, Number of dags: 197 +Matmuls sharded with this dim: +[2,2,64] @ [2,64,2048(s)] = [2,2048(s)] Number of occurrences: 28 +[2,2048(s)] @ [2048(s),128] = [2,128] Number of occurrences: 28 + + +Shard dim: 2, Number of dags: 196 +Matmuls sharded with this dim: +[8,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [8,8,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,128] = [8,2,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [8,2,2,2,2,64] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [8,2,2,2,64] Number of occurrences: 28 +[8,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [8,2(s),2,4,128] Number of occurrences: 28 +[8,2,8,128] @ [2,8,128,2(s),6,2,128] = [8,2(s),6,2,128] Number of occurrences: 56 + + +Shard dim: 256, Number of dags: 10 +Matmuls sharded with this dim: + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2,8,128] @ [2,8,128,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:39:40Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:41Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:41Z INFO 8868 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.431 seconds +2025-11-04T21:39:41Z INFO 8868 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 1.239 seconds +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 9.233 seconds +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.111 seconds +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.170 seconds +2025-11-04T21:39:42Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:43Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8868 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.217 seconds +2025-11-04T21:39:43Z INFO 8868 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:39:43Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (79, 'AG3736'), (80, 'AG3735'), (218, 'AG3727'), (474, 'AG3726'), (274, 'AG3733')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9810 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (83, 'AG3752'), (84, 'AG3751'), (218, 'AG3727'), (474, 'AG3726'), (272, 'AG3749')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10061 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (87, 'AG3768'), (88, 'AG3767'), (218, 'AG3727'), (474, 'AG3726'), (270, 'AG3765')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10312 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (91, 'AG3784'), (92, 'AG3783'), (218, 'AG3727'), (474, 'AG3726'), (268, 'AG3781')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (95, 'AG3800'), (96, 'AG3799'), (218, 'AG3727'), (474, 'AG3726'), (266, 'AG3797')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10814 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (99, 'AG3816'), (100, 'AG3815'), (218, 'AG3727'), (474, 'AG3726'), (264, 'AG3813')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11065 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (103, 'AG3832'), (104, 'AG3831'), (218, 'AG3727'), (474, 'AG3726'), (262, 'AG3829')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11316 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (107, 'AG3848'), (108, 'AG3847'), (218, 'AG3727'), (474, 'AG3726'), (260, 'AG3845')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (111, 'AG3864'), (112, 'AG3863'), (218, 'AG3727'), (474, 'AG3726'), (258, 'AG3861')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11818 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (115, 'AG3880'), (116, 'AG3879'), (218, 'AG3727'), (474, 'AG3726'), (256, 'AG3877')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12069 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (119, 'AG3896'), (120, 'AG3895'), (218, 'AG3727'), (474, 'AG3726'), (254, 'AG3893')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12320 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (123, 'AG3912'), (124, 'AG3911'), (218, 'AG3727'), (474, 'AG3726'), (252, 'AG3909')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12571 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (127, 'AG3928'), (128, 'AG3927'), (218, 'AG3727'), (474, 'AG3726'), (250, 'AG3925')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12822 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (131, 'AG3944'), (132, 'AG3943'), (218, 'AG3727'), (474, 'AG3726'), (248, 'AG3941')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13073 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (135, 'AG3960'), (136, 'AG3959'), (218, 'AG3727'), (474, 'AG3726'), (246, 'AG3957')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13324 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (139, 'AG3976'), (140, 'AG3975'), (218, 'AG3727'), (474, 'AG3726'), (244, 'AG3973')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (143, 'AG3992'), (144, 'AG3991'), (218, 'AG3727'), (474, 'AG3726'), (242, 'AG3989')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13826 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (147, 'AG4008'), (148, 'AG4007'), (218, 'AG3727'), (474, 'AG3726'), (240, 'AG4005')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14077 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (151, 'AG4024'), (152, 'AG4023'), (218, 'AG3727'), (474, 'AG3726'), (238, 'AG4021')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14328 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (155, 'AG4040'), (156, 'AG4039'), (218, 'AG3727'), (474, 'AG3726'), (236, 'AG4037')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (159, 'AG4056'), (160, 'AG4055'), (218, 'AG3727'), (474, 'AG3726'), (234, 'AG4053')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14830 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (163, 'AG4072'), (164, 'AG4071'), (218, 'AG3727'), (474, 'AG3726'), (232, 'AG4069')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15081 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (167, 'AG4088'), (168, 'AG4087'), (218, 'AG3727'), (474, 'AG3726'), (230, 'AG4085')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15332 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (171, 'AG4104'), (172, 'AG4103'), (218, 'AG3727'), (474, 'AG3726'), (228, 'AG4101')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (175, 'AG4120'), (176, 'AG4119'), (218, 'AG3727'), (474, 'AG3726'), (226, 'AG4117')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15834 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (179, 'AG4136'), (180, 'AG4135'), (218, 'AG3727'), (474, 'AG3726'), (224, 'AG4133')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16085 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (183, 'AG4152'), (184, 'AG4151'), (218, 'AG3727'), (474, 'AG3726'), (222, 'AG4149')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 2, 2, 2, 2, 1024, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (187, 'AG4168'), (188, 'AG4167'), (218, 'AG3727'), (474, 'AG3726'), (220, 'AG4165')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(78, 'AG3741'), (273, 'AG3740'), (79, 'AG3736'), (80, 'AG3735'), (81, 'AG3734'), (358, 'AG3739'), (470, 'AG3738')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (471, 'AG3737')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23387 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23392 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23388 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(79, 'AG3736'), (191, 'AG3731'), (80, 'AG3735'), (81, 'AG3734'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(469, 'AG3742'), (74, 'AG3744'), (357, 'AG3743')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23401 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(37, 'AG3748'), (1, 'AG3745'), (356, 'AG3747'), (468, 'AG3746')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23399 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(82, 'AG3757'), (271, 'AG3756'), (83, 'AG3752'), (84, 'AG3751'), (85, 'AG3750'), (355, 'AG3755'), (466, 'AG3754')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (467, 'AG3753')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23402 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23407 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23403 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(83, 'AG3752'), (191, 'AG3731'), (84, 'AG3751'), (85, 'AG3750'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(465, 'AG3758'), (75, 'AG3760'), (354, 'AG3759')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23416 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(38, 'AG3764'), (2, 'AG3761'), (353, 'AG3763'), (464, 'AG3762')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(86, 'AG3773'), (269, 'AG3772'), (87, 'AG3768'), (88, 'AG3767'), (89, 'AG3766'), (352, 'AG3771'), (462, 'AG3770')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (463, 'AG3769')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23417 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23422 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23418 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(87, 'AG3768'), (191, 'AG3731'), (88, 'AG3767'), (89, 'AG3766'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(461, 'AG3774'), (76, 'AG3776'), (351, 'AG3775')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23431 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(39, 'AG3780'), (3, 'AG3777'), (350, 'AG3779'), (460, 'AG3778')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input91|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23429 of IO tensor {'CrossPassTensor': ''}bfloat16 %input93|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(90, 'AG3789'), (267, 'AG3788'), (91, 'AG3784'), (92, 'AG3783'), (93, 'AG3782'), (349, 'AG3787'), (458, 'AG3786')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (459, 'AG3785')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23432 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23437 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23433 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(91, 'AG3784'), (191, 'AG3731'), (92, 'AG3783'), (93, 'AG3782'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(457, 'AG3790'), (192, 'AG3792'), (348, 'AG3791')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23446 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(40, 'AG3796'), (4, 'AG3793'), (347, 'AG3795'), (456, 'AG3794')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(94, 'AG3805'), (265, 'AG3804'), (95, 'AG3800'), (96, 'AG3799'), (97, 'AG3798'), (346, 'AG3803'), (454, 'AG3802')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (455, 'AG3801')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23447 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23452 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23448 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(95, 'AG3800'), (191, 'AG3731'), (96, 'AG3799'), (97, 'AG3798'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(453, 'AG3806'), (193, 'AG3808'), (345, 'AG3807')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23461 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(41, 'AG3812'), (5, 'AG3809'), (344, 'AG3811'), (452, 'AG3810')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(98, 'AG3821'), (263, 'AG3820'), (99, 'AG3816'), (100, 'AG3815'), (101, 'AG3814'), (343, 'AG3819'), (450, 'AG3818')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (451, 'AG3817')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23462 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(99, 'AG3816'), (191, 'AG3731'), (100, 'AG3815'), (101, 'AG3814'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(449, 'AG3822'), (194, 'AG3824'), (342, 'AG3823')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23476 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG3828'), (6, 'AG3825'), (341, 'AG3827'), (448, 'AG3826')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(102, 'AG3837'), (261, 'AG3836'), (103, 'AG3832'), (104, 'AG3831'), (105, 'AG3830'), (340, 'AG3835'), (446, 'AG3834')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (447, 'AG3833')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23477 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23482 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23478 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(103, 'AG3832'), (191, 'AG3731'), (104, 'AG3831'), (105, 'AG3830'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(445, 'AG3838'), (195, 'AG3840'), (339, 'AG3839')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23491 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(43, 'AG3844'), (7, 'AG3841'), (338, 'AG3843'), (444, 'AG3842')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(106, 'AG3853'), (259, 'AG3852'), (107, 'AG3848'), (108, 'AG3847'), (109, 'AG3846'), (337, 'AG3851'), (442, 'AG3850')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (443, 'AG3849')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23492 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23497 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23493 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(107, 'AG3848'), (191, 'AG3731'), (108, 'AG3847'), (109, 'AG3846'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(441, 'AG3854'), (196, 'AG3856'), (336, 'AG3855')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23506 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(44, 'AG3860'), (8, 'AG3857'), (335, 'AG3859'), (440, 'AG3858')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(110, 'AG3869'), (257, 'AG3868'), (111, 'AG3864'), (112, 'AG3863'), (113, 'AG3862'), (334, 'AG3867'), (438, 'AG3866')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (439, 'AG3865')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23507 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23512 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23508 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(111, 'AG3864'), (191, 'AG3731'), (112, 'AG3863'), (113, 'AG3862'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(437, 'AG3870'), (197, 'AG3872'), (333, 'AG3871')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23521 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG3876'), (9, 'AG3873'), (332, 'AG3875'), (436, 'AG3874')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(114, 'AG3885'), (255, 'AG3884'), (115, 'AG3880'), (116, 'AG3879'), (117, 'AG3878'), (331, 'AG3883'), (434, 'AG3882')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (435, 'AG3881')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23522 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23527 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23523 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(115, 'AG3880'), (191, 'AG3731'), (116, 'AG3879'), (117, 'AG3878'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(433, 'AG3886'), (198, 'AG3888'), (330, 'AG3887')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23536 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(46, 'AG3892'), (10, 'AG3889'), (329, 'AG3891'), (432, 'AG3890')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(118, 'AG3901'), (253, 'AG3900'), (119, 'AG3896'), (120, 'AG3895'), (121, 'AG3894'), (328, 'AG3899'), (430, 'AG3898')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (431, 'AG3897')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23537 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23542 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23538 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(119, 'AG3896'), (191, 'AG3731'), (120, 'AG3895'), (121, 'AG3894'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(429, 'AG3902'), (199, 'AG3904'), (327, 'AG3903')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23551 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(47, 'AG3908'), (11, 'AG3905'), (326, 'AG3907'), (428, 'AG3906')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(122, 'AG3917'), (251, 'AG3916'), (123, 'AG3912'), (124, 'AG3911'), (125, 'AG3910'), (325, 'AG3915'), (426, 'AG3914')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (427, 'AG3913')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23552 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23557 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23553 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(123, 'AG3912'), (191, 'AG3731'), (124, 'AG3911'), (125, 'AG3910'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(425, 'AG3918'), (200, 'AG3920'), (324, 'AG3919')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23566 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(48, 'AG3924'), (12, 'AG3921'), (323, 'AG3923'), (424, 'AG3922')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(126, 'AG3933'), (249, 'AG3932'), (127, 'AG3928'), (128, 'AG3927'), (129, 'AG3926'), (322, 'AG3931'), (422, 'AG3930')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (423, 'AG3929')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23567 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23572 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23568 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(127, 'AG3928'), (191, 'AG3731'), (128, 'AG3927'), (129, 'AG3926'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(421, 'AG3934'), (201, 'AG3936'), (321, 'AG3935')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23581 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(49, 'AG3940'), (13, 'AG3937'), (320, 'AG3939'), (420, 'AG3938')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(130, 'AG3949'), (247, 'AG3948'), (131, 'AG3944'), (132, 'AG3943'), (133, 'AG3942'), (319, 'AG3947'), (418, 'AG3946')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (419, 'AG3945')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23582 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23587 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23583 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG3944'), (191, 'AG3731'), (132, 'AG3943'), (133, 'AG3942'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(417, 'AG3950'), (202, 'AG3952'), (318, 'AG3951')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23596 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(50, 'AG3956'), (14, 'AG3953'), (317, 'AG3955'), (416, 'AG3954')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(134, 'AG3965'), (245, 'AG3964'), (135, 'AG3960'), (136, 'AG3959'), (137, 'AG3958'), (316, 'AG3963'), (414, 'AG3962')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (415, 'AG3961')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23597 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23602 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23598 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(135, 'AG3960'), (191, 'AG3731'), (136, 'AG3959'), (137, 'AG3958'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(413, 'AG3966'), (203, 'AG3968'), (315, 'AG3967')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23611 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(51, 'AG3972'), (15, 'AG3969'), (314, 'AG3971'), (412, 'AG3970')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(138, 'AG3981'), (243, 'AG3980'), (139, 'AG3976'), (140, 'AG3975'), (141, 'AG3974'), (313, 'AG3979'), (410, 'AG3978')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (411, 'AG3977')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23612 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23617 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23613 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(139, 'AG3976'), (191, 'AG3731'), (140, 'AG3975'), (141, 'AG3974'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(409, 'AG3982'), (204, 'AG3984'), (312, 'AG3983')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23626 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(52, 'AG3988'), (16, 'AG3985'), (311, 'AG3987'), (408, 'AG3986')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(142, 'AG3997'), (241, 'AG3996'), (143, 'AG3992'), (144, 'AG3991'), (145, 'AG3990'), (310, 'AG3995'), (406, 'AG3994')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (407, 'AG3993')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23627 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23632 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23628 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(143, 'AG3992'), (191, 'AG3731'), (144, 'AG3991'), (145, 'AG3990'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(405, 'AG3998'), (205, 'AG4000'), (309, 'AG3999')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23641 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(53, 'AG4004'), (17, 'AG4001'), (308, 'AG4003'), (404, 'AG4002')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23640 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23652 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(146, 'AG4013'), (239, 'AG4012'), (147, 'AG4008'), (148, 'AG4007'), (149, 'AG4006'), (307, 'AG4011'), (402, 'AG4010')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23651 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (403, 'AG4009')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23642 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23647 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23643 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(147, 'AG4008'), (191, 'AG3731'), (148, 'AG4007'), (149, 'AG4006'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(401, 'AG4014'), (206, 'AG4016'), (306, 'AG4015')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23656 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(54, 'AG4020'), (18, 'AG4017'), (305, 'AG4019'), (400, 'AG4018')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23655 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23653 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23654 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23667 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(150, 'AG4029'), (237, 'AG4028'), (151, 'AG4024'), (152, 'AG4023'), (153, 'AG4022'), (304, 'AG4027'), (398, 'AG4026')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23666 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (399, 'AG4025')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23657 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23662 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23658 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(151, 'AG4024'), (191, 'AG3731'), (152, 'AG4023'), (153, 'AG4022'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(397, 'AG4030'), (207, 'AG4032'), (303, 'AG4031')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23671 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(55, 'AG4036'), (19, 'AG4033'), (302, 'AG4035'), (396, 'AG4034')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23670 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23668 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23669 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23682 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(154, 'AG4045'), (235, 'AG4044'), (155, 'AG4040'), (156, 'AG4039'), (157, 'AG4038'), (301, 'AG4043'), (394, 'AG4042')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23681 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (395, 'AG4041')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23672 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23677 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23673 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(155, 'AG4040'), (191, 'AG3731'), (156, 'AG4039'), (157, 'AG4038'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(393, 'AG4046'), (208, 'AG4048'), (300, 'AG4047')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23686 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(56, 'AG4052'), (20, 'AG4049'), (299, 'AG4051'), (392, 'AG4050')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23685 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23683 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23684 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23697 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(158, 'AG4061'), (233, 'AG4060'), (159, 'AG4056'), (160, 'AG4055'), (161, 'AG4054'), (298, 'AG4059'), (390, 'AG4058')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23696 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (391, 'AG4057')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23687 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23692 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23688 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(159, 'AG4056'), (191, 'AG3731'), (160, 'AG4055'), (161, 'AG4054'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(389, 'AG4062'), (209, 'AG4064'), (297, 'AG4063')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23701 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(57, 'AG4068'), (21, 'AG4065'), (296, 'AG4067'), (388, 'AG4066')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23700 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23699 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23712 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(162, 'AG4077'), (231, 'AG4076'), (163, 'AG4072'), (164, 'AG4071'), (165, 'AG4070'), (295, 'AG4075'), (386, 'AG4074')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23711 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (387, 'AG4073')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23707 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23703 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(163, 'AG4072'), (191, 'AG3731'), (164, 'AG4071'), (165, 'AG4070'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(385, 'AG4078'), (210, 'AG4080'), (294, 'AG4079')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23716 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(58, 'AG4084'), (22, 'AG4081'), (293, 'AG4083'), (384, 'AG4082')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23715 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23713 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23727 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(166, 'AG4093'), (229, 'AG4092'), (167, 'AG4088'), (168, 'AG4087'), (169, 'AG4086'), (292, 'AG4091'), (382, 'AG4090')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23726 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (383, 'AG4089')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23717 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(167, 'AG4088'), (191, 'AG3731'), (168, 'AG4087'), (169, 'AG4086'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(381, 'AG4094'), (211, 'AG4096'), (291, 'AG4095')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23731 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(59, 'AG4100'), (23, 'AG4097'), (290, 'AG4099'), (380, 'AG4098')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23730 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23728 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23729 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23742 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(170, 'AG4109'), (227, 'AG4108'), (171, 'AG4104'), (172, 'AG4103'), (173, 'AG4102'), (289, 'AG4107'), (378, 'AG4106')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23741 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (379, 'AG4105')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23732 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23737 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23733 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(171, 'AG4104'), (191, 'AG3731'), (172, 'AG4103'), (173, 'AG4102'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(377, 'AG4110'), (212, 'AG4112'), (288, 'AG4111')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23746 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(60, 'AG4116'), (24, 'AG4113'), (287, 'AG4115'), (376, 'AG4114')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23745 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23743 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23744 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23757 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(174, 'AG4125'), (225, 'AG4124'), (175, 'AG4120'), (176, 'AG4119'), (177, 'AG4118'), (286, 'AG4123'), (374, 'AG4122')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23756 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (375, 'AG4121')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23747 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23752 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23748 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(175, 'AG4120'), (191, 'AG3731'), (176, 'AG4119'), (177, 'AG4118'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(373, 'AG4126'), (213, 'AG4128'), (285, 'AG4127')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23761 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(61, 'AG4132'), (25, 'AG4129'), (284, 'AG4131'), (372, 'AG4130')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23760 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23758 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23759 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23772 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(178, 'AG4141'), (223, 'AG4140'), (179, 'AG4136'), (180, 'AG4135'), (181, 'AG4134'), (283, 'AG4139'), (370, 'AG4138')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23771 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (371, 'AG4137')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23762 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23767 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23763 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(179, 'AG4136'), (191, 'AG3731'), (180, 'AG4135'), (181, 'AG4134'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(369, 'AG4142'), (214, 'AG4144'), (282, 'AG4143')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23776 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(62, 'AG4148'), (26, 'AG4145'), (281, 'AG4147'), (368, 'AG4146')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23775 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23773 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23774 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23787 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(182, 'AG4157'), (221, 'AG4156'), (183, 'AG4152'), (184, 'AG4151'), (185, 'AG4150'), (280, 'AG4155'), (366, 'AG4154')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23786 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (367, 'AG4153')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23777 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23782 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23778 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(183, 'AG4152'), (191, 'AG3731'), (184, 'AG4151'), (185, 'AG4150'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(365, 'AG4158'), (215, 'AG4160'), (279, 'AG4159')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23791 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(63, 'AG4164'), (27, 'AG4161'), (278, 'AG4163'), (364, 'AG4162')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23790 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23788 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23789 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23802 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(186, 'AG4173'), (219, 'AG4172'), (187, 'AG4168'), (188, 'AG4167'), (189, 'AG4166'), (277, 'AG4171'), (362, 'AG4170')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23801 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (363, 'AG4169')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23792 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23797 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23793 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(187, 'AG4168'), (191, 'AG3731'), (188, 'AG4167'), (189, 'AG4166'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(361, 'AG4174'), (216, 'AG4176'), (276, 'AG4175')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23806 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(64, 'AG4180'), (28, 'AG4177'), (275, 'AG4179'), (360, 'AG4178')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23805 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23803 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23804 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(190, 'AG4182'), (217, 'AG4181'), (191, 'AG3731')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23807 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23385 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23386 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16621 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23361 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16596 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16639 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16656 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.584 seconds +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.245 seconds +2025-11-04T21:39:45Z INFO 8868 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.311 seconds +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.069 seconds +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.030 seconds +2025-11-04T21:39:46Z INFO 8868 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.689 seconds +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 5.968 seconds +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.888 seconds +2025-11-04T21:39:49Z INFO 8868 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.127 seconds +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.343 seconds +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 40.753 seconds +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.492 seconds +2025-11-04T21:39:50Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:51Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.321 seconds +2025-11-04T21:39:51Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:39:51Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:39:52Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.637 seconds +2025-11-04T21:39:52Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.056 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.695 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.309 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.310 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.086 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.048 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.184 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.121 seconds +2025-11-04T21:39:53Z INFO 8868 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.240 seconds +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: transpose_128x64 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 1024: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.097 seconds +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.353 seconds +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.354 seconds +2025-11-04T21:39:57Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.719 seconds +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.423 seconds +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:58Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.520 seconds +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.521 seconds +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 1.042 seconds +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.072 seconds +2025-11-04T21:39:59Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.230 seconds +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.097 seconds +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.038 seconds +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.110 seconds +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.283 seconds +2025-11-04T21:40:00Z INFO 8868 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:40:01Z INFO 8868 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8868 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.394 seconds +2025-11-04T21:40:01Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:01Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.504 seconds +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.504 seconds +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.077 seconds +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.288 seconds +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/LICM]: LICM finished after 0.120 seconds +2025-11-04T21:40:02Z INFO 8868 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.852 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.004 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.020 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.037 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.079 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.073 seconds +2025-11-04T21:40:04Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.359 seconds +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.577 seconds +2025-11-04T21:40:05Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.278 seconds +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.252 seconds +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.248 seconds +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.248 seconds +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.605 seconds +2025-11-04T21:40:06Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.065 seconds +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.365 seconds +2025-11-04T21:40:07Z INFO 8868 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:08Z INFO 8868 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8868 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.622 seconds +2025-11-04T21:40:08Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:08Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.048 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.204 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.183 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.445 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.125 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.202 seconds +2025-11-04T21:40:09Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.194 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.405 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.097 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.628 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.051 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.680 seconds +2025-11-04T21:40:10Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.264 seconds +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.093 seconds +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.062 seconds +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.062 seconds +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.090 seconds +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:40:11Z INFO 8868 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:40:12Z INFO 8868 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.853 seconds +2025-11-04T21:40:12Z INFO 8868 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8868 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.853 seconds +2025-11-04T21:40:12Z INFO 8868 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:40:13Z INFO 8868 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:40:13Z INFO 8868 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.642 seconds +2025-11-04T21:40:13Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:13Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.585 seconds +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.585 seconds +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.082 seconds +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:40:14Z INFO 8868 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:40:15Z INFO 8868 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 1.259 seconds +2025-11-04T21:40:15Z INFO 8868 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:40:15Z INFO 8868 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 1.260 seconds +2025-11-04T21:40:15Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.197 seconds +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.492 seconds +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.113 seconds +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:16Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.935 seconds +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.179 seconds +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.123 seconds +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.052 seconds +2025-11-04T21:40:17Z INFO 8868 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:18Z INFO 8868 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:18Z INFO 8868 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.686 seconds +2025-11-04T21:40:18Z INFO 8868 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 1.309 seconds +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.058 seconds +2025-11-04T21:40:19Z INFO 8868 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.147 seconds +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.282 seconds +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:20Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.757 seconds +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.667 seconds +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.425 seconds +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.063 seconds +2025-11-04T21:40:21Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.969 seconds +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.117 seconds +2025-11-04T21:40:22Z INFO 8868 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.225 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.278 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.089 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.036 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.351 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.042 seconds +2025-11-04T21:40:23Z INFO 8868 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:24Z INFO 8868 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:40:24Z INFO 8868 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.166 seconds +2025-11-04T21:40:24Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:28Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 2.351 seconds +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.248 seconds +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 6.832 seconds +2025-11-04T21:40:30Z INFO 8868 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.149 seconds +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.173 seconds +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.094 seconds +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.161 seconds +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 10.710% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'38147.56459'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=56458, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_38147 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 1.624% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'38660.56469'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=56467, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_38660 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 138.776us (32.031MiB, est bw: 242.024GB/s, 0.976% of tot. time) for bfloat16<128 x 8200> TongaSB partitions[2] bfloat16 (2, 8, 128, 8200) %'all_gather.1_nostride_60933'(init=0.0)[i242_0_0_42945,T_i2,i0.128,i1.8200] = load bfloat16<128 x 8200> non_local bfloat16 (16384,) %'all_gather.1'[8i0.128+1024T_i2+i1.8200] # id=48225, src_id=None, , attrs={'can_read_uninit': True}, instances=16 # dl = tensor_op_name: _add.383 | hlo_id: 383 | [[i0.128];[i1.8200]] -> [[i0.128];[i1.8200]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input5_local_39953'[i160_25349_0_42922,2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input5'[2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,0,i160_25349_0_42922,i2.8,i0.128,i1.128] # id=48194, src_id=None, , instances=64 # dl = tensor_op_name: _dot.354 | hlo_id: 13489 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input7_local_40064'[i359_25350_0_42996,2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input7'[2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,0,i359_25350_0_42996,i2.8,i0.128,i1.128] # id=48394, src_id=None, , instances=64 # dl = tensor_op_name: _dot.698 | hlo_id: 13600 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input9_local_40167'[i531_25351_0_43070,2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input9'[2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,0,i531_25351_0_43070,i2.8,i0.128,i1.128] # id=48590, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1042 | hlo_id: 13711 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input11_local_40270'[i703_25352_0_43144,2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input11'[2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,0,i703_25352_0_43144,i2.8,i0.128,i1.128] # id=48786, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1386 | hlo_id: 13822 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input13_local_40373'[i875_25353_0_43218,2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input13'[2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,0,i875_25353_0_43218,i2.8,i0.128,i1.128] # id=48982, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1730 | hlo_id: 13933 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input15_local_40476'[i1047_25354_0_43292,2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input15'[2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,0,i1047_25354_0_43292,i2.8,i0.128,i1.128] # id=49178, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2074 | hlo_id: 14044 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 99.175us (16.000MiB, est bw: 169.167GB/s, 0.697% of tot. time) for bfloat16<128 x 1024> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 8, 128) %'input17_local_40579'[i1219_25355_0_43366,2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,i0.128,i2.8,i1.128] = load bfloat16<128 x 1024> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 2, 8, 128, 128) %'input17'[2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,0,i1219_25355_0_43366,i2.8,i0.128,i1.128] # id=49374, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2418 | hlo_id: 14155 | [[i0.128];[i1.128, i2.8]] -> [[i0.128];[i1.128, i2.8]] +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.082 seconds +2025-11-04T21:40:31Z INFO 8868 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.017 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.009 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.009 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.010 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:31Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.026 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.048 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.021 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8868 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8868 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.742 seconds +2025-11-04T21:40:33Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:33Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.888 seconds +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.888 seconds +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:40:34Z WARNING 8868 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 86.24 percent of all matmul computation +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.135 seconds +2025-11-04T21:40:34Z INFO 8868 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.523 seconds +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.104 seconds +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.119 seconds +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.232 seconds +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:35Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.719 seconds +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.720 seconds +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.145 seconds +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.162 seconds +2025-11-04T21:40:36Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:39Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:39Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.503 seconds +2025-11-04T21:40:40Z INFO 8868 [Tensorizer]: BirCodeGen estimate #instances=111463 in sg0000 +2025-11-04T21:40:40Z INFO 8868 [Tensorizer]: IR signature: 118795129421928c96996f8962016aef874e28589277e29f8ab69bce9acf08a1 for nc00/sg0000/TensorizerBIR +2025-11-04T21:40:40Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:42Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:42Z INFO 8868 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.426 seconds +2025-11-04T21:40:44Z INFO 8868 [Tensorizer]: BirCodeGen estimate #instances=111463 in sg0000 +2025-11-04T21:40:44Z INFO 8868 [Tensorizer]: IR signature: f402370e2f3559a09a624e8f190865367a7f8dda16016f96530815351f84ac69 for nc01/sg0000/TensorizerBIR +2025-11-04T21:40:44Z INFO 8868 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:40:44Z INFO 8868 [Tensorizer]: Successfully built model. +2025-11-04T21:40:44Z USER 8868 [root/Tensorizer/Tensorizer]: Tensorizer finished after 127.040 seconds +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: End tensorization +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input0 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input1 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input2 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input3 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input4 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input5 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input6 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input7 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input8 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input9 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input10 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input11 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input12 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input13 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input14 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input15 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input16 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input17 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input18 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input19 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input20 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input21 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input22 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input23 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input24 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input25 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input26 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input27 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input28 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input29 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input30 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input31 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input32 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input33 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input34 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input35 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input36 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input37 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input38 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input39 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input40 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input41 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input42 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input43 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input44 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input45 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input46 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input47 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input48 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input49 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input50 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input51 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input52 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input53 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input54 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input55 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input56 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input57 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input58 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input59 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input60 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input61 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input62 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input63 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input64 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input65 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input66 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input67 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input68 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input69 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input70 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input71 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input72 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input73 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input74 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input75 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input76 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input77 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input78 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input79 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input80 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input81 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input82 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input83 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input84 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input85 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input86 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input87 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input88 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input89 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input90 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input91 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input92 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input93 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input94 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input95 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input96 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input97 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input98 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input99 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input100 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input101 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input102 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input103 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input104 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input105 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input106 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input107 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input108 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input109 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input110 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input111 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input112 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input113 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input114 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input115 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input116 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input117 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input118 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input119 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input120 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input121 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input122 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input123 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input124 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input125 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input126 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input127 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input128 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input129 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input130 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input131 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input132 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input133 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input134 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input135 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input136 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input137 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input138 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input139 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input140 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input141 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input142 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input143 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input144 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input145 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input146 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input147 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input148 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input149 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input150 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input151 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input152 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input153 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input154 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input155 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input156 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input157 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input158 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input159 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input160 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input161 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input162 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input163 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input164 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input165 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input166 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input167 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input168 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input169 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input170 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input171 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input172 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input173 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input174 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input175 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input176 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input177 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input178 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input179 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input180 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input181 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input182 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input183 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input184 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input185 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input186 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input187 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input188 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input189 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input190 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input191 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input192 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input193 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input194 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input195 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input196 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input197 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input198 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input199 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input200 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input201 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input202 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input203 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input204 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input205 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input206 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input207 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input208 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input209 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input210 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input211 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input212 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input213 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input214 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input215 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input216 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input217 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input218 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input219 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input220 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input221 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input222 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input223 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input224 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input225 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input226 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input227 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input228 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input229 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input230 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input231 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input232 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input233 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input234 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input235 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input236 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input237 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input238 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input239 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input240 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input241 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input242 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input243 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input244 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input245 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input246 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input247 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input248 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input249 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input250 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input251 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input252 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input253 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input254 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input255 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input256 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input257 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input258 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input259 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input260 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input261 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input262 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input263 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input264 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input265 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input266 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input267 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input268 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input269 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input270 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input271 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input272 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input273 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input274 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input275 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input276 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input277 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input278 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input279 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input280 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input281 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input282 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input283 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input284 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input285 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input286 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input287 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input288 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input289 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input290 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input291 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input292 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input293 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input294 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input295 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input296 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input297 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input298 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input299 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input300 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input301 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input302 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input303 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input304 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input305 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input306 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input307 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input308 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input309 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input310 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input311 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input312 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input313 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input314 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input315 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input316 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input317 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input318 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input319 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input320 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input321 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input322 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input323 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input324 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input325 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input326 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input327 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input328 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input329 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input330 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input331 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input332 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input333 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input334 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input335 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input336 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input337 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input338 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input339 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input340 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input341 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input342 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input343 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input344 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input345 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input346 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input347 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input348 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input349 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input350 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input351 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input352 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input353 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input354 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input355 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input356 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input357 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input358 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input359 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input360 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input361 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input362 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input363 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input364 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input365 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input366 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input367 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input368 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input369 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Network input: input370 +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:44Z INFO 8868 [job.Frontend.0]: Job #0 finished +2025-11-04T21:40:44Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:40:44Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:40:44Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:40:44Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels vector_dynamic_offsets,scalar_dynamic_offset,spill_reload,io --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:40:44Z INFO 8868 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:40:44Z INFO 9610 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:40:44Z INFO 9610 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:40:44Z INFO 9610 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:40:44Z INFO 9610 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:40:45Z INFO 9610 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt" +2025-11-04T21:40:45Z INFO 9610 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:40:45Z INFO 9610 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:40:45Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:45Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:45Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:45Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9610 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.002 seconds +2025-11-04T21:40:45Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 457mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9610 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.003 seconds +2025-11-04T21:40:45Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 457mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:45Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:45Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:45Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:45Z WARNING 9610 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.363.63889}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:45Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.363.63889}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:46Z USER 9610 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.246 seconds +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 810mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.286 seconds +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:46Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.303 seconds +2025-11-04T21:40:46Z INFO 9610 [BackendPassManager]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:46Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:46Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12176 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.008 seconds +2025-11-04T21:40:46Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12176 memory location(s), 2 block(s), and 10056 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:46Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.020 seconds +2025-11-04T21:40:46Z INFO 9610 [BackendPassManager]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:46Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10056 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:46Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:46Z USER 9610 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.002 seconds +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:46Z USER 9610 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.003 seconds +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 832mb, ru_maxrss: 924mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5028 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z INFO 9610 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:46 2025 +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6088 blocks=1 instructions=5028 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z INFO 9610 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:46 2025 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:46 2025 + +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Total count: 98428 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Matmult: 76517 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: GenericCopy: 13857 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Load: 2364 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: TensorScalarPtr: 1890 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: TensorTensor: 1330 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Memset: 275 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Select: 226 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Iota: 58 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:40:48Z USER 9610 (nc01/sg00) [ModuleForkPass]: unroll finished after 2.377 seconds +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2084mb, ru_maxrss: 2084mb (delta=1160mb) +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 54916 memory location(s), 1 block(s), and 98428 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:48Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=54916 blocks=1 instructions=98428 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:46 2025 + +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Total count: 99588 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Matmult: 76517 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: GenericCopy: 13857 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: TensorScalarPtr: 2450 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Load: 2364 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: TensorTensor: 1330 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Activation: 733 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Iota: 394 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Save: 378 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Memset: 275 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Select: 226 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: TensorReduce: 63 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:40:48Z USER 9610 (nc00/sg00) [ModuleForkPass]: unroll finished after 2.617 seconds +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1827mb, ru_maxrss: 2084mb (delta=1160mb) +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 54916 memory location(s), 1 block(s), and 99588 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:48Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=54916 blocks=1 instructions=99588 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9610 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:48Z INFO 9610 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.308 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1827mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.314 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1826mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 2.956 seconds +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: curr_vmrss: 1826mb, ru_maxrss: 2084mb (delta=1160mb) +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.012 seconds +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1826mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49356 memory location(s), 2 block(s), and 196416 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.022 seconds +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: curr_vmrss: 1826mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:49Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.299 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.308 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.323 seconds +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.064 seconds +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49356 memory location(s), 2 block(s), and 196416 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.090 seconds +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:49Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49356 blocks=2 instructions=196416 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.025 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.028 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.017 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.020 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.104 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.014 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z WARNING 9610 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.001 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.120 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.017 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z WARNING 9610 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.002 seconds +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:49Z INFO 9610 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:49Z INFO 9610 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.003 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.038 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.045 seconds +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.120 seconds +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:49Z INFO 9610 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:40:49Z INFO 9610 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:49Z INFO 9610 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:49Z INFO 9610 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.003 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.043 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.048 seconds +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.129 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1832mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.486 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1833mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.027 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1833mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.515 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1833mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.028 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1833mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.076 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1833mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.092 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.283 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.018 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.226 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.015 seconds +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23950 memory location(s), 1 block(s), and 96839 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=23950 blocks=1 instructions=96839 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.017 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.014 seconds +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1835mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25406 memory location(s), 1 block(s), and 99577 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=25406 blocks=1 instructions=99577 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: To Spill 9 multi-layer tensors +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:50Z INFO 9610 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: To Spill 10 multi-layer tensors +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:50Z INFO 9610 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:50Z INFO 9610 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:50 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [build_flow_deps]: Allocs: 23970 instructions: 96859 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [build_flow_deps]: Allocs: 25428 instructions: 99595 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 246992 edges +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [build_flow_deps]: Done build fdeps 246992 Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 265177 edges +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [build_flow_deps]: Done build fdeps 265177 Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z USER 9610 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 0.975 seconds +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1881mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23970 memory location(s), 1 block(s), and 96859 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:51Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:51Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=23970 blocks=1 instructions=96859 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:51Z INFO 9610 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:51 2025 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:52 2025 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 1.170 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1493mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25428 memory location(s), 1 block(s), and 99595 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=25428 blocks=1 instructions=99595 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 31 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.281 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1497mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23969 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=23969 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.001 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1497mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23970 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=23970 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.001 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1497mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.012 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1497mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.012 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1497mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: size = 14282 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.336 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25397 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=25397 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.003 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25398 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: found 35573 edges +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: mean: 4.98152 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: median: 6.63414 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 284584 bytes +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=25398 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.002 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.014 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.014 seconds +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: lo = 14208 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: total = 14282 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.345 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1501mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: size = 14454 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.175 seconds +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1503mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: found 35631 edges +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: mean: 4.93026 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: median: 6.61081 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 285048 bytes +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: lo = 14380 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: total = 14454 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:52Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:52Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:53Z USER 9610 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.522 seconds +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1506mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:53Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 363 PSUM Banks +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:53Z USER 9610 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.258 seconds +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1509mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:53Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 546 PSUM Banks +2025-11-04T21:40:53Z USER 9610 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.627 seconds +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1511mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:53Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1352321614 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4665 bytes +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1779712 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 377 bytes +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: size = 9109 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: found 13663 accumulation groups +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i20 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:53Z INFO 9610 []: find first defs for local +2025-11-04T21:40:53Z INFO 9610 []: find first defs for global +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: 2346 remat count +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:53Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 314 PSUM Banks +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: Num intervals 9109 Num locations 9109 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: edge: 181746 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: mean: 39.9047 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: median: 24.4662 +2025-11-04T21:40:53Z INFO 9610 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: safe = 8713 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: unsafe = 392 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: inf = 2 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: total = 9107 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 9109 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Total: 9107 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (9107) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Rover zone: 0.919 (8371) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.069 (626) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.012 (110) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.098 (892) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (1) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.758 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.000 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.000 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.902 (8214) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.574 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.567 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 619 PSUM Banks +2025-11-04T21:40:54Z USER 9610 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 0.952 seconds +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1484mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1352321614 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4665 bytes +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1779712 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 377 bytes +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:54Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:54Z USER 9610 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 0.830 seconds +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1484mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:54Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1357421238 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 4669 bytes +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2812010 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 507 bytes +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: size = 10357 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:54Z USER 9610 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.243 seconds +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1487mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23971 memory location(s), 1 block(s), and 96858 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:54Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=23971 blocks=1 instructions=96858 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1354101326, 98.2521% input load, 0% output write, 1.74785% spill/reload [sg0000] +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: found 13835 accumulation groups +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: largest = _dot.9689-t44936_i11 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: requires 8448 bytes/partition +2025-11-04T21:40:54Z INFO 9610 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:54Z INFO 9610 []: find first defs for local +2025-11-04T21:40:54Z INFO 9610 []: find first defs for global +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:54Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: 2355 remat count +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Num intervals 10357 Num locations 10357 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: edge: 217156 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: mean: 41.9342 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: median: 26.1617 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.33043e+09) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: safe = 9799 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: unsafe = 398 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: inf = 158 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: total = 10355 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 7 #Pinned 0 #Safe 0 minCost 0.0251235 maxCost 0.0251235 locations 10357 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: new candidates = 3 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4 spill/reload instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4 spill/reload memory locations +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Total: 10355 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (10355) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Rover zone: 0.893 (9250) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.060 (625) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.046 (476) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (4) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.090 (930) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.043 (447) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.307 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.306 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.702 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.867 (8978) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.526 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.250 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 6144, 0.0259594% out of total spill/reload dma traffic +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1357421238 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 4669 bytes +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2812010 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 507 bytes +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:55Z USER 9610 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 1.628 seconds +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1499mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:55Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:55Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:55Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:56Z USER 9610 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.207 seconds +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1503mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25399 memory location(s), 1 block(s), and 99564 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:56Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=25399 blocks=1 instructions=99564 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1360233248, 97.9988% input load, 2.35254e-06% output write, 2.00121% spill/reload [sg0000] +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 12 SpillSaves and Reloads +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4674 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 428 bytes +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.004818% out of total dma traffic(1.33301e+09) +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 4 SpillSaves and Reloads +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4676 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 442 bytes +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4 spill/reload instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 4 spill/reload memory locations +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 4676 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 442 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1352318542 +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4676 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1776640 +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 442 bytes +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 1]: removed 0 spill/reload memory locations +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 6144, 0.0225707% out of total spill/reload dma traffic +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 6144, 0.000453733% out of total dma traffic +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1354095182, 98.2526% input load, 0% output write, 1.74741% spill/reload [sg0000] +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1352318542 +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4676 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1776640 +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 442 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16416 +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 7 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4581 bytes +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:56Z USER 9610 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 2.454 seconds +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1508mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 96846 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:56Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:56Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=23953 blocks=1 instructions=96846 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:56Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:57Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 169 Sb address +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:40:57Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1026 Sb address +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 12 SpillSaves and Reloads +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 4680 bytes +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 565 bytes +2025-11-04T21:40:57Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 226 Sb address +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 4 SpillSaves and Reloads +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 4682 bytes +2025-11-04T21:40:57Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 580 bytes +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 35 Sb address +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 0 SpillSaves and Reloads +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 4682 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 580 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1357352630 +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 4682 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2808938 +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 580 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1474 Sb address +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 71680, 0.00526968% out of total dma traffic +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1360161568, 97.9991% input load, 2.35266e-06% output write, 2.00086% spill/reload [sg0000] +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1357352630 +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 4682 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2808938 +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 580 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 482400 +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 85 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 4529 bytes +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:40:58Z USER 9610 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 2.398 seconds +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1516mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99551 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:58Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=25380 blocks=1 instructions=99551 Max writers: 298 Max Readers: 21965 +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:40:58Z USER 9610 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 1.714 seconds +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1518mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 193 Sb address +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 96846 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:58Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=23953 blocks=1 instructions=96846 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: spill space = 72704 bytes +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 77824 bytes +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:40:58Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: size = 4 +2025-11-04T21:40:58Z INFO 9610 []: find first defs for local +2025-11-04T21:40:59Z INFO 9610 []: find first defs for global +2025-11-04T21:40:59Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 991 Sb address +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: Num intervals 4 Num locations 4 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: lo = 4 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: total = 4 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 77824 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.470 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1526mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 96846 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=23953 blocks=1 instructions=96846 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 74752 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 74752 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.183 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1530mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 96846 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=23953 blocks=1 instructions=96846 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 2749 out of 13845 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.041 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1540mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 96846 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=23953 blocks=1 instructions=96846 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.061 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1540mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 97072, number of allocs: 23953 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.02431 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.017 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1540mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.014 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1540mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.013 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1540mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.097 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1541mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.018 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1541mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47751_i1}@SB<0,26448>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47760_i1}@SB<0,23184>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47769_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47778_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47787_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47796_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47805_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47814_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47823_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47832_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47841_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47850_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47859_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47868_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47877_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47886_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47895_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47904_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47913_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47922_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47931_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47940_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47949_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47958_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47967_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47976_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47985_i1}@SB<0,18760>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t47994_i1}@SB<0,23752>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_29149_45152_i1}@SB<32,16552>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:59Z WARNING 9610 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_29153_45157_i1}@SB<96,17672>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.241 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1543mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.020 seconds +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1543mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:40:59 2025 +2025-11-04T21:40:59Z INFO 9610 (nc01/sg00) [build_flow_deps]: Allocs: 23953 instructions: 97072 +2025-11-04T21:40:59Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 257 Sb address +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 247433 edges +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [build_flow_deps]: Done build fdeps 247433 Tue Nov 4 21:41:00 2025 +2025-11-04T21:41:00Z USER 9610 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.350 seconds +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1565mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:00Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:41:00Z USER 9610 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.069 seconds +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1565mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:00Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:00Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:00Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 21 Sb address +2025-11-04T21:41:00Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1987 Sb address +2025-11-04T21:41:01Z USER 9610 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.775 seconds +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1722mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:01Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.683 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1722mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99551 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=25380 blocks=1 instructions=99551 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:01Z USER 9610 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.190 seconds +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1722mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:01Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=23953 blocks=1 instructions=97072 Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: spill space = 105536 bytes +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 118784 bytes +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: size = 10 +2025-11-04T21:41:01Z INFO 9610 []: find first defs for local +2025-11-04T21:41:01Z USER 9610 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.083 seconds +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1722mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 []: find first defs for global +2025-11-04T21:41:01Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 23953 memory location(s), 1 block(s), and 97072 instruction(s). Max writers: 298 Max Readers: 21181 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Num intervals 10 Num locations 10 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: lo = 10 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: total = 10 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 81920 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.385 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1722mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99551 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=25380 blocks=1 instructions=99551 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 81920 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 81920 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.218 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1723mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99551 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=25380 blocks=1 instructions=99551 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 2861 out of 14090 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.041 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99551 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=25380 blocks=1 instructions=99551 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.068 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 99777, number of allocs: 25380 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.013525 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.018 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.015 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.015 seconds +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:41:01Z INFO 9610 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.120 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.023 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1724mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.254 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.019 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1725mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:41:02 2025 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [build_flow_deps]: Allocs: 25380 instructions: 99777 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 265588 edges +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [build_flow_deps]: Done build fdeps 265588 Tue Nov 4 21:41:02 2025 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.292 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1748mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.054 seconds +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1748mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:02Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:03Z USER 9610 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.720 seconds +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:03Z USER 9610 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.188 seconds +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=25380 blocks=1 instructions=99777 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.089 seconds +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25380 memory location(s), 1 block(s), and 99777 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:03Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 14.085 seconds +2025-11-04T21:41:03Z INFO 9610 [BackendPassManager]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:03Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49333 blocks=2 instructions=196849 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=49333 blocks=2 instructions=196849 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.010 seconds +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49333 memory location(s), 2 block(s), and 196849 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=49333 blocks=2 instructions=196849 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.079 seconds +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197763 instruction(s). Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:03Z USER 9610 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:41:03Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=49731 blocks=2 instructions=197763 Max writers: 298 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.333 seconds +2025-11-04T21:41:04Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197767 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.449 seconds +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: curr_vmrss: 1818mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:41:04Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: reserved space = 271688 bytes +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: size = 132 +2025-11-04T21:41:04Z INFO 9610 []: find first defs for local +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:41:04Z INFO 9610 []: find first defs for global +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: reserved space = 238848 bytes +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:41:04Z USER 9610 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.286 seconds +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Num intervals 132 Num locations 132 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: lo = 132 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: total = 132 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 81920 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 81920 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3850240 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3850240 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6303744 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:04Z USER 9610 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.417 seconds +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.430 seconds +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:41:04Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.007 seconds +2025-11-04T21:41:04Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197767 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.015 seconds +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:04Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:04Z USER 9610 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.076 seconds +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.098 seconds +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:04Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.105 seconds +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: curr_vmrss: 1820mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:04Z USER 9610 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:04Z INFO 9610 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:04Z USER 9610 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:04Z USER 9610 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:04Z INFO 9610 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:04Z INFO 9610 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z USER 9610 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.301 seconds +2025-11-04T21:41:05Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 1821mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z USER 9610 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.423 seconds +2025-11-04T21:41:05Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 1821mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:05Z USER 9610 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:05Z USER 9610 [BackendPassManager]: nc_parallel_pass finished after 0.443 seconds +2025-11-04T21:41:05Z INFO 9610 [BackendPassManager]: curr_vmrss: 1821mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:05Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:05Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197767 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:05Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:05Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:05Z USER 9610 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.003 seconds +2025-11-04T21:41:05Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1821mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z USER 9610 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.005 seconds +2025-11-04T21:41:05Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1821mb, ru_maxrss: 2084mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:05Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:05Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:05Z INFO 9610 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:05Z INFO 9610 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:05Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:05Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:05Z INFO 9610 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:05Z INFO 9610 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:05Z INFO 9610 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:05 2025 +2025-11-04T21:41:05Z INFO 9610 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:05 2025 +2025-11-04T21:41:06Z INFO 9610 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:06Z INFO 9610 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:09Z INFO 9610 [post_scheduler]: Time-aware simulation time: 7970729 +2025-11-04T21:41:09Z INFO 9610 [post_scheduler]: Time-aware simulation time: 8382851 +2025-11-04T21:41:09Z INFO 9610 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:09 2025 +2025-11-04T21:41:09Z USER 9610 (nc01/sg00) [ModuleForkPass]: post_sched finished after 4.772 seconds +2025-11-04T21:41:09Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2207mb, ru_maxrss: 2207mb (delta=123mb) +2025-11-04T21:41:09Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:09Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:10Z USER 9610 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.017 seconds +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1995mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97531 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:10Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=24152 blocks=1 instructions=97531 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:10Z USER 9610 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.097 seconds +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2004mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:10Z INFO 9610 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:10 2025 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: post_sched finished after 4.981 seconds +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2000mb, ru_maxrss: 2207mb (delta=123mb) +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.019 seconds +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1969mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.101 seconds +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1971mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:10Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 5.131 seconds +2025-11-04T21:41:10Z INFO 9610 [BackendPassManager]: curr_vmrss: 1971mb, ru_maxrss: 2207mb (delta=123mb) +2025-11-04T21:41:10Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:10Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197735 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:10Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=49731 blocks=2 instructions=197735 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.009 seconds +2025-11-04T21:41:10Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 1971mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197735 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:10Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.018 seconds +2025-11-04T21:41:10Z INFO 9610 [BackendPassManager]: curr_vmrss: 1971mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:10Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:10Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197735 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:10Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:10Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:10Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:10Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:11Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 8535 PSUM Banks +2025-11-04T21:41:11Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 8367 PSUM Banks +2025-11-04T21:41:11Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 308 PSUM Banks +2025-11-04T21:41:11Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 282 PSUM Banks +2025-11-04T21:41:12Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 6527 PSUM Banks +2025-11-04T21:41:12Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 6274 PSUM Banks +2025-11-04T21:41:12Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 8 Sb address +2025-11-04T21:41:12Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 65 Sb address +2025-11-04T21:41:12Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 22 Sb address +2025-11-04T21:41:12Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 25 Sb address +2025-11-04T21:41:12Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 108 Sb address +2025-11-04T21:41:12Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 168 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 65 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 98 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2189 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: moved 15 MM forward +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2787 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: moved 3 MM forward +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 7 Sb address +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:41:13Z USER 9610 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 3.346 seconds +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1981mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:13Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:13Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 9 Sb address +2025-11-04T21:41:13Z USER 9610 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 3.492 seconds +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2010mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:13Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:13Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:14Z USER 9610 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.872 seconds +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2183mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:14Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:14Z USER 9610 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.094 seconds +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2046mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:14Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:41:14 2025 +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [build_flow_deps]: Allocs: 24152 instructions: 97499 +2025-11-04T21:41:14Z USER 9610 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.905 seconds +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2039mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:14Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:14Z USER 9610 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.113 seconds +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1943mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:14Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:41:14 2025 +2025-11-04T21:41:14Z INFO 9610 (nc00/sg00) [build_flow_deps]: Allocs: 25579 instructions: 100236 +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 249931 edges +2025-11-04T21:41:14Z INFO 9610 (nc01/sg00) [build_flow_deps]: Done build fdeps 249931 Tue Nov 4 21:41:14 2025 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.395 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1950mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬───────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼───────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 177 │ 131072 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 2261 │ 1330268224 │ +│ Load │ Internal │ 69 │ 1937414 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 337 │ 1776640 │ +└──────────────┴───────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 66 │ +│ 8 │ 5 │ +│ 16 │ 6 │ +│ 32 │ 60 │ +│ 64 │ 3 │ +│ 88 │ 3 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 2 │ +│ 2048 │ 981 │ +│ 4096 │ 298 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 728 │ +│ 16400 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 75669 #MatMult-Transposes 21228 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5790025920 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60831_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_60933_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.035 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1950mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97499 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 268761 edges +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [build_flow_deps]: Done build fdeps 268761 Tue Nov 4 21:41:15 2025 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.389 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1950mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 208 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 10 │ 2678024 │ +│ Load │ ExternalInput -> Internal │ 2262 │ 1330268256 │ +│ Load │ Internal │ 81 │ 4458566 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 382 │ 2808906 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 74 │ +│ 8 │ 6 │ +│ 16 │ 6 │ +│ 32 │ 61 │ +│ 64 │ 7 │ +│ 88 │ 3 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 305 │ +│ 1024 │ 17 │ +│ 2048 │ 981 │ +│ 4096 │ 326 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 728 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 76513 #MatMult-Transposes 22012 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5790025920 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60831_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t80793 │ Internal │ float32 │ 2562048 │ +│ -t80787 │ Internal │ float32 │ 2562048 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ -t80790 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ all_gather.1_nostride_60933_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60933_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.038 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1950mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100236 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 4.974 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1950mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=49731 blocks=2 instructions=197735 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 453 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 407 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:41:15Z INFO 9610 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: assign_trigger_engine finished after 0.116 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197735 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197735 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=24152 blocks=1 instructions=97499 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=25579 blocks=1 instructions=100236 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.030 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.031 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.037 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=49731 blocks=2 instructions=197853 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: assign_hwdge_engine finished after 0.032 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 197853 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197853 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 7 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 226 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 192 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 134 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 26 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2267 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 7 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.017 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 18 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 260 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 290 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 74 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 33 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2492 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 9 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.018 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.028 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.029 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.055 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197853 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:15Z USER 9610 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:15Z INFO 9610 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.002 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.002 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: nc_parallel_pass finished after 0.006 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197853 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.001 seconds +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.002 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:15Z USER 9610 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.144 seconds +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z USER 9610 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.157 seconds +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.166 seconds +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: curr_vmrss: 1951mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:15Z USER 9610 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:15Z INFO 9610 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=197853 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z USER 9610 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:15Z USER 9610 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:15Z INFO 9610 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:15Z INFO 9610 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:15Z INFO 9610 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 81263 +2025-11-04T21:41:15Z INFO 9610 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 92063 +2025-11-04T21:41:16Z INFO 9610 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 85063 +2025-11-04T21:41:16Z INFO 9610 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 85063 +2025-11-04T21:41:16Z INFO 9610 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 96932 +2025-11-04T21:41:16Z INFO 9610 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 96932 +2025-11-04T21:41:16Z INFO 9610 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:16Z INFO 9610 (nc01/sg00) [DepReduction]: Finished dependency reduction: 581139 removed, new total 31408 +2025-11-04T21:41:16Z INFO 9610 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:16Z USER 9610 (nc01) [CoreForkPass]: dep_reduction finished after 1.218 seconds +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2095mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:16Z USER 9610 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:16Z USER 9610 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.020 seconds +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2091mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:16Z USER 9610 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:16Z INFO 9610 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.061 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2099mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.016 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2104mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97558 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=24152 blocks=1 instructions=97558 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2144/2144 (100% DGE) + power-of-2 partition : 2144/2149 (99.7673% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2145/2150 (99.7674% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 33/33 (100% DGE) + power-of-2 partition : 33/423 (7.80142% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 33/423 (7.80142% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 9/9 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: lower_dma finished after 0.075 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2116mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97559 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=24152 blocks=1 instructions=97559 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: expand_all_engine finished after 0.020 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2116mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97559 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=24152 blocks=1 instructions=97559 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [DepReduction]: Finished dependency reduction: 675600 removed, new total 33000 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: dep_reduction finished after 1.483 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2108mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.021 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2106mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.135 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97559 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=24152 blocks=1 instructions=97559 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.063 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.016 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100295 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=25579 blocks=1 instructions=100295 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2144/2144 (100% DGE) + power-of-2 partition : 2144/2151 (99.6746% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2145/2152 (99.6747% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 42/42 (100% DGE) + power-of-2 partition : 42/485 (8.65979% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 42/485 (8.65979% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 197 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: lower_dma finished after 0.081 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100297 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=25579 blocks=1 instructions=100297 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: expand_all_engine finished after 0.020 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100297 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=25579 blocks=1 instructions=100297 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: expand_inst_late finished after 0.154 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97568 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=24152 blocks=1 instructions=97568 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.016 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 97561 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=24152 blocks=1 instructions=97561 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: lower_sync finished after 0.041 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2105mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101231 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=24152 blocks=1 instructions=101231 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: lower_act finished after 0.019 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2106mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=24152 blocks=1 instructions=101372 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.144 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2107mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100297 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=25579 blocks=1 instructions=100297 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: expand_inst_late finished after 0.168 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2135mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100531 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=25579 blocks=1 instructions=100531 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.018 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2136mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 100301 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=25579 blocks=1 instructions=100301 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: lower_sync finished after 0.046 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2150mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104408 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: lower_dve finished after 0.289 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2136mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=25579 blocks=1 instructions=104408 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=24152 blocks=1 instructions=101372 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: lower_act finished after 0.022 seconds +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2109mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z USER 9610 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:17Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=25579 blocks=1 instructions=104550 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:17Z INFO 9610 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: lower_ap finished after 0.022 seconds +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2109mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z USER 9610 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:17Z INFO 9610 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=24152 blocks=1 instructions=101372 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:17Z INFO 9610 (nc01/sg00) [REG_Allocator]: size = 2 +2025-11-04T21:41:17Z INFO 9610 []: find first defs for local reg +2025-11-04T21:41:17Z INFO 9610 []: find first defs for global reg +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: lo = 2 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: total = 2 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:18Z USER 9610 (nc00) [CoreForkPass]: lower_dve finished after 0.297 seconds +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2179mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z USER 9610 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.295 seconds +2025-11-04T21:41:18Z INFO 9610 (nc01) [CoreForkPass]: curr_vmrss: 2113mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=25579 blocks=1 instructions=104550 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z INFO 9610 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:18Z USER 9610 (nc00) [CoreForkPass]: lower_ap finished after 0.024 seconds +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2111mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=25579 blocks=1 instructions=104550 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:41:18Z INFO 9610 []: find first defs for local reg +2025-11-04T21:41:18Z INFO 9610 []: find first defs for global reg +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:18Z USER 9610 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.318 seconds +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: curr_vmrss: 2136mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: nc_parallel_pass finished after 2.848 seconds +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: curr_vmrss: 2084mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: vnc_remote_addr_map finished after 0.013 seconds +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: curr_vmrss: 2015mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 205922 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: Running vnc_link +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z INFO 9610 [VncLink]: Found 0 remote updates +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: vnc_link finished after 0.004 seconds +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: curr_vmrss: 2015mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 205922 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:18Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=24152 blocks=1 instructions=101372 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=25579 blocks=1 instructions=104550 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.206 seconds +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2031mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:18Z USER 9610 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.234 seconds +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2002mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 0.242 seconds +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: curr_vmrss: 2002mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:41:18Z INFO 9610 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.041 seconds +2025-11-04T21:41:18Z INFO 9610 (sg00) [SubgraphForkPass]: curr_vmrss: 2003mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z INFO 9610 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 205922 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: subgraph_parallel_pass finished after 0.052 seconds +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: curr_vmrss: 2002mb, ru_maxrss: 2207mb (delta=0mb) +2025-11-04T21:41:18Z USER 9610 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:18Z INFO 9610 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z USER 9610 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:18Z USER 9610 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=24152 blocks=1 instructions=101372 Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=25579 blocks=1 instructions=104550 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:18Z INFO 9610 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64238 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:18Z INFO 9610 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64238 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 76974 │ +│ LDWEIGHTS │ 76974 │ +│ CAST │ 10734 │ +│ EVENT_SEMAPHORE │ 3670 │ +│ COPY │ 3080 │ +│ UNKNOWN(0xd4) │ 2300 │ +│ ACTIVATE │ 2232 │ +│ TENSOR_TENSOR │ 1329 │ +│ UNKNOWN(0xd8) │ 589 │ +│ PSEUDO_DMA_TRIGGER │ 565 │ +│ TENSOR_SCALAR │ 265 │ +│ MEMSET │ 257 │ +│ UNKNOWN(0xe8) │ 226 │ +│ ACT_TABLE_LOAD │ 141 │ +│ TENSOR_SCALAR_ADDR │ 113 │ +│ UNKNOWN(0xda) │ 68 │ +│ UNKNOWN(0xd9) │ 59 │ +│ TENSOR_REDUCE │ 58 │ +│ RECIPROCAL │ 57 │ +│ STREAM_SHUFFLE │ 24 │ +│ LOAD_MASK_SELECT │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 2 │ +│ IOTA │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 4718 │ +│ Scalar │ 13186 │ +│ Tensor │ 155551 │ +│ SyncDMA │ 0 │ +│ Vector │ 6247 │ +│ Sync │ 50 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: +┌─────────────────────┬───────┐ +│ Opcode │ Count │ +├─────────────────────┼───────┤ +│ MATMUL │ 77878 │ +│ LDWEIGHTS │ 77878 │ +│ CAST │ 10734 │ +│ EVENT_SEMAPHORE │ 4107 │ +│ COPY │ 3314 │ +│ UNKNOWN(0xd4) │ 2534 │ +│ ACTIVATE │ 2239 │ +│ TENSOR_TENSOR │ 1330 │ +│ TENSOR_SCALAR_ADDR │ 674 │ +│ PSEUDO_DMA_TRIGGER │ 650 │ +│ UNKNOWN(0xd8) │ 589 │ +│ IOTA │ 394 │ +│ UNKNOWN(0xda) │ 293 │ +│ MEMSET │ 269 │ +│ TENSOR_SCALAR │ 267 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ GATHER │ 240 │ +│ UNKNOWN(0xe8) │ 226 │ +│ ACT_TABLE_LOAD │ 142 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_REDUCE │ 63 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 6882 │ +│ Scalar │ 13660 │ +│ Tensor │ 157457 │ +│ SyncDMA │ 0 │ +│ Vector │ 6681 │ +│ Sync │ 80 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:19Z USER 9610 (nc01/sg00) [Codegen]: isa_gen finished after 0.764 seconds +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 3072 │ +│ qDVESpillReload0 │ 2144 │ +│ qPoolSpillReload0 │ 39513 │ +│ qSPIO0 │ 33 │ +│ qSPSpillReload0 │ 68 │ +└───────────────────┴────────────────┘ + +Total descriptors: 44830 (0.000668019 GB) +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌─────────────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├─────────────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ _dot.4826-t47867_i1 │ Internal │ bfloat16 │ 1 │ +│ dot.110.54349 │ Internal │ float32 │ 1 │ +│ 38660.56469_i363 │ Internal │ float32 │ 1 │ +│ 38660.56469_i386 │ Internal │ float32 │ 1 │ +│ dot.110.54347 │ Internal │ float32 │ 1 │ +│ 38660.56469_i506 │ Internal │ float32 │ 1 │ +│ all_reduce.25-buffer-80495 │ Internal │ bfloat16 │ 1 │ +│ _reduce.6516-t57161_i0_remote_0 │ Internal │ float32 │ 1 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└─────────────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:19Z USER 9610 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.011 seconds +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:19Z USER 9610 (nc00/sg00) [Codegen]: isa_gen finished after 0.785 seconds +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4612 │ +│ qDVESpillReload0 │ 1156 │ +│ qPoolSpillReload0 │ 50340 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 202 │ +└───────────────────┴────────────────┘ + +Total descriptors: 56361 (0.000839844 GB) +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ custom_call.142 │ Internal │ float32 │ 2 │ +│ rng.1 │ Internal │ float32 │ 2 │ +│ get_tuple_element.4 │ Internal │ uint32 │ 2 │ +│ custom_call.143 │ Internal │ float32 │ 2 │ +│ input2 │ ExternalInput │ int32 │ 2 │ +│ get_tuple_element.5 │ Internal │ float32 │ 2 │ +│ broadcast_in_dim.17_i0 │ Internal │ int32 │ 2 │ +│ split_8 │ Internal │ int32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:19Z USER 9610 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.011 seconds +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:19Z WARNING 9610 (nc01/sg00) [Codegen]: Found 283 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:19Z USER 9610 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.213 seconds +2025-11-04T21:41:19Z USER 9610 (nc01/sg00) [ModuleForkPass]: codegen finished after 1.022 seconds +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2243mb, ru_maxrss: 2243mb (delta=36mb) +2025-11-04T21:41:19Z WARNING 9610 (nc00/sg00) [Codegen]: Found 247 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:19Z USER 9610 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.232 seconds +2025-11-04T21:41:19Z INFO 9610 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 24152 memory location(s), 1 block(s), and 101372 instruction(s). Max writers: 299 Max Readers: 21181 +2025-11-04T21:41:19Z USER 9610 (nc00/sg00) [ModuleForkPass]: codegen finished after 1.064 seconds +2025-11-04T21:41:19Z INFO 9610 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2194mb, ru_maxrss: 2243mb (delta=36mb) +2025-11-04T21:41:20Z INFO 9610 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 25579 memory location(s), 1 block(s), and 104550 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:20Z USER 9610 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:20Z USER 9610 [BackendPassManager]: mod_parallel_pass finished after 1.092 seconds +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: curr_vmrss: 2024mb, ru_maxrss: 2243mb (delta=36mb) +2025-11-04T21:41:20Z USER 9610 [BackendPassManager]: Running hbm_usage +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:20Z INFO 9610 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 122.156KB │ +│ CCE │ 0.000B │ 674.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 160.500KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:20Z INFO 9610 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.663GB │ +│ Model Code │ 11.277MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.012MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 957.328KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:20Z INFO 9610 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 416.000B │ 110.594KB │ +│ CCE │ 0.000B │ 506.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.500KB │ 139.750KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:20Z INFO 9610 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.662GB │ +│ Model Code │ 10.971MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.012MB │ +│ DMA Ring IO │ 1.906KB │ +│ DMA Ring Spill │ 757.016KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:20Z INFO 9610 [HBMUsage]: Total estimated HBM usage is: 3.680GB +2025-11-04T21:41:20Z USER 9610 [BackendPassManager]: hbm_usage finished after 0.010 seconds +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: curr_vmrss: 1996mb, ru_maxrss: 2243mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 205922 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:20Z USER 9610 [BackendPassManager]: Running neff_packager +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=49731 blocks=2 instructions=205922 Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:20Z WARNING 9610 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt/metrics.json +2025-11-04T21:41:20Z WARNING 9610 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:41:20Z INFO 9610 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff +2025-11-04T21:41:20Z INFO 9610 [NeffFileWriter]: IR signature: ca58e895e1bf63f7479ce4627d4038f6 for neff artifacts +2025-11-04T21:41:20Z USER 9610 [BackendPassManager]: neff_packager finished after 0.396 seconds +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: curr_vmrss: 1997mb, ru_maxrss: 2243mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9610 [BackendPassManager]: Output has 2 module(s), 2 function(s), 49731 memory location(s), 2 block(s), and 205922 instruction(s). Max writers: 299 Max Readers: 21965 +2025-11-04T21:41:20Z INFO 9610 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000076 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005871 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000111 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005882 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000076 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005871 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000072 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000072 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000072 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005871 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:41:20Z INFO 9610 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_10 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_4 │ uint8 │ 1 │ 0.003906 MB │ +│ split_6 │ uint8 │ 1 │ 0.003906 MB │ +│ split_8 │ int32 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:20Z INFO 9610 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:20Z INFO 9610 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_9 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.003906 MB │ +│ split_4 │ uint8 │ 1 │ 0.003906 MB │ +│ split_7 │ uint8 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:20Z INFO 9610 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:41:21Z INFO 8868 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:41:21Z INFO 8868 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:41:21Z INFO 8868 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt +2025-11-04T21:41:21Z INFO 8868 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:41:21Z INFO 8868 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt/hlo_netlist.json +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk4/neuronxcc-zqk3xjbt/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:41:21Z INFO 8868 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:41:21Z INFO 8868 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:41:21Z INFO 8807 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk4/metaneff.pb b/token_generation_model/_tp0_bk4/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..f6d74635a239066d9b8bb9cef07f9f0102c5d159 --- /dev/null +++ b/token_generation_model/_tp0_bk4/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85a123eb95f38ce850a5acba0b3c90d88641822b27ea344a0ae85ba0c6e8b665 +size 3988817 diff --git a/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb b/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..4361b6b3bd7aac369eea3ca4a8455004843880f9 --- /dev/null +++ b/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4190c0880e3c40978d782202617f80e971d9f98dfc92aee64d71ffd80bf477 +size 4075105 diff --git a/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff b/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff new file mode 100644 index 0000000000000000000000000000000000000000..93e5b5f83483e9053651ad4f12e1fcfbc490a2d0 --- /dev/null +++ b/token_generation_model/_tp0_bk4/model.MODULE_ec05e5a8222761962028+3b7d8ecf.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f876ca24c3808c5b51fa8c58cffc7004fbec6bbc8196380a31d0c971a6b88875 +size 8889344 diff --git a/token_generation_model/_tp0_bk4/neuron_config.json b/token_generation_model/_tp0_bk4/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ac89273da4f8dd00fb5ab0dc75c676b2be8c13bb --- /dev/null +++ b/token_generation_model/_tp0_bk4/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 2048 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 2048 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/token_generation_model/_tp0_bk5/command.txt b/token_generation_model/_tp0_bk5/command.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f8d19eee8f0b2c45e96239a0280b83b65a63509 --- /dev/null +++ b/token_generation_model/_tp0_bk5/command.txt @@ -0,0 +1 @@ +neuronx-cc compile --framework=XLA model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb --output model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=log-neuron-cc.txt --verbose=35 \ No newline at end of file diff --git a/token_generation_model/_tp0_bk5/compile_flags.MODULE_b0c5e51af4aeb4ea04b2+a0432539.json b/token_generation_model/_tp0_bk5/compile_flags.MODULE_b0c5e51af4aeb4ea04b2+a0432539.json new file mode 100644 index 0000000000000000000000000000000000000000..24512682cba1a7230c6a7b6a7972c7af8f113729 --- /dev/null +++ b/token_generation_model/_tp0_bk5/compile_flags.MODULE_b0c5e51af4aeb4ea04b2+a0432539.json @@ -0,0 +1 @@ +["--target=trn2", "--auto-cast=none", "--model-type=transformer", "--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma ", "--lnc=2", "-O2", "--internal-hlo2tensorizer-options=--verify-hlo=true", "--logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/log-neuron-cc.txt"] \ No newline at end of file diff --git a/token_generation_model/_tp0_bk5/global_metric_store.json b/token_generation_model/_tp0_bk5/global_metric_store.json new file mode 100644 index 0000000000000000000000000000000000000000..da3280429de9c5038eae39ffa023b0a218eaea5b --- /dev/null +++ b/token_generation_model/_tp0_bk5/global_metric_store.json @@ -0,0 +1,590 @@ +{ + "Average": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 98.94332885742188, + "StaticProfiler::AveragePartitionUtilization": 86.24988555908203, + "StaticProfiler::AveragePeUtilization": 68.06179809570313, + "StaticProfiler::LocalizationEfficiency": 158.05445861816406, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 160.4937286376953, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0 + } + }, + "Count": { + "tensorizer": { + "StaticProfiler::AverageFractalPeUtilization": 1, + "StaticProfiler::AveragePartitionUtilization": 1, + "StaticProfiler::AveragePeUtilization": 1, + "StaticProfiler::LocalizationEfficiency": 1, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 1, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 1, + "TilingProfiler::AveragePeUtilizationAfterTiling": 1 + } + }, + "Sum": { + "compiletime": { + "AGOrderingAnalysisPass": 2.577322006225586, + "AffinePredicateResolution": 0.09284520149230957, + "AliasDependencyElimination": 0.004574298858642578, + "AliasDependencyInduction": 0.813002347946167, + "AliasDependencyReset": 0.8516831398010254, + "BFComputeCutting": 0.4163675308227539, + "BirCodeGenLoop": 2.0439093112945557, + "CCOpFusion": 1.2363176345825195, + "CanonicalizeConv": 0.0, + "CanonicalizeDAGForPGTiling": 0.1793205738067627, + "CanonicalizeForTensorizer": 0.00041000000783242285, + "CanonicalizeIR": 0.10800528526306152, + "Canonicalizer": 0.009233999997377396, + "CoalesceCCOp": 0.17753362655639648, + "CommuteConcat": 0.03344416618347168, + "DMALocalityOpt": 0.044352054595947266, + "DMAProfiler": 0.08528518676757813, + "DMATilingProfiler": 0.09637188911437988, + "DataLocalityOpt": 3.2345950603485107, + "DataStreaming": 0.17251181602478027, + "DeConcat": 0.06256246566772461, + "DeadCodeElimination": 0.03286480903625488, + "DeadStoreElimination": 1.1570920944213867, + "DelinearIndices": 0.4294607639312744, + "Delinearization": 0.16837501525878906, + "DelinearizeSPMD": 0.21433115005493164, + "DoNothing": 0.0009343624114990234, + "DramToDramTranspose": 0.341036319732666, + "DumpGraphAndMetadata": 0.14029169082641602, + "EliminateDivs": 0.23865985870361328, + "ExpandBatchNorm": 0.11588501930236816, + "ExpandISAMacro": 0.09187030792236328, + "FactorizeBlkDims": 0.612274169921875, + "FactorizeThreadAxesInFreeDims": 0.0903313159942627, + "FlattenMacroLoop": 0.10088062286376953, + "GenericAccessSimplifier": 0.02825760841369629, + "HoistCompute": 8.399999933317304e-05, + "IdentifyCrossPassTensors": 0.00015100000018719584, + "InferInitValue": 1.7035224437713623, + "InferIntrinsicOnCC": 0.3380553722381592, + "InferNeuronTensor": 1.7164275646209717, + "InferNonlocalTensors": 5.598329544067383, + "InferPSumTensor": 1.4246575832366943, + "InferShardAxis": 9.271984100341797, + "InferSharedMemLoc": 0.12148380279541016, + "InlineNativeKernels": 0.057215213775634766, + "InsertCoreBarrier": 0.1337273120880127, + "InsertIOTransposes": 0.876704216003418, + "InsertImplicitShardAxisBeforeISel": 0.41373419761657715, + "InsertLocalTransposes": 0.7329738140106201, + "InsertOffloadedTransposes": 0.13259291648864746, + "LICM": 0.12085676193237305, + "LateLegalizeInst": 0.15541791915893555, + "LateLegalizePostSplit": 0.1052546501159668, + "LateLowerReshapeOp": 0.03813338279724121, + "LateLowerTensorOp": 0.3610198497772217, + "LateNeuronInstComb": 1.1589789390563965, + "LayoutPreprocessing": 0.9314424991607666, + "LayoutPreprocessingAndAnalysis": 1.3875887393951416, + "LayoutRequirementAnalysis": 0.4473288059234619, + "LegalizeCCOpLayout": 0.13658738136291504, + "LegalizeOpLevelAlias": 0.05193901062011719, + "LegalizePartitionReduce": 0.09119105339050293, + "LegalizeSundaAccess": 0.9767770767211914, + "LegalizeSundaMacro": 0.6998641490936279, + "LegalizeType": 0.1455233097076416, + "LocalLayoutOpt": 0.6927175521850586, + "LoopFusion": 0.38257908821105957, + "LoopSplitting": 0.031409502029418945, + "LowerBroadcast": 0.5477209091186523, + "LowerCCOpBlockAxis": 0.2258608341217041, + "LowerComplexBroadcast": 0.08743643760681152, + "LowerIntrinsics": 0.9349169731140137, + "LowerShardAxis": 0.23558402061462402, + "LowerTensorOp": 0.6309223175048828, + "LowerToSendRecv": 0.16785597801208496, + "LowerTranspose": 0.4913480281829834, + "MacroGeneration": 2.3815677165985107, + "MaskPropagation": 0.11515140533447266, + "MemcastMotion": 0.00013299999409355223, + "MemcpyElimination": 9.188785552978516, + "MutateDataType": 0.04154014587402344, + "NeuronAliasDependencyInduction": 0.02184748649597168, + "NeuronAliasDependencyReset": 0.03938126564025879, + "NeuronInstComb": 0.40968942642211914, + "NeuronLICM": 0.2851901054382324, + "NeuronLoopFusion": 1.5726032257080078, + "NeuronLoopInterchange": 0.0657188892364502, + "NeuronSimplifier": 0.5047965049743652, + "NeuronSimplifyPredicates": 0.2825462818145752, + "NeuronValueNumbering": 0.12901020050048828, + "OptimizeAliasedCopyChain": 0.020284414291381836, + "OptimizeNKIKernels": 1.7541263103485107, + "PAGLayoutOpt": 15.631235122680664, + "PComputeCutting": 0.3185763359069824, + "PGLayoutTilingPipeline": 41.048789978027344, + "PGTiling": 6.015847206115723, + "PadElimination": 0.021477460861206055, + "ParAxesAnnotation": 14.886741638183594, + "PartialLoopFusion": 1.2531118392944336, + "PartialSimdFusion": 0.8362796306610107, + "PenguinizeFunctions": 0.00019799999427050352, + "PerfectLoopNest": 0.07204103469848633, + "PruneFunctions": 0.0004140000091865659, + "RecognizeOpIdiom": 0.12753748893737793, + "Recompute": 0.00962972640991211, + "RelaxPredicates": 0.11856865882873535, + "Rematerialization": 0.21219396591186523, + "RemoveOptimizationBarriers": 6.900000153109431e-05, + "RemoveShardedPartitionAxes": 1.250422716140747, + "ReshapeWeights": 0.03184843063354492, + "ResolveAccessConflict": 0.19439220428466797, + "ResolveComplicatePredicates": 0.07511138916015625, + "RewriteReplicationMatmul": 0.04624319076538086, + "RewriteWeights": 0.08813834190368652, + "SFKVectorizer": 6.741949081420898, + "ScatterMotion": 0.003625999903306365, + "ShardingPropagationAnalysis": 0.7337610721588135, + "SimpleAllReduceTiling": 0.06987547874450684, + "Simplifier": 0.10178971290588379, + "SimplifyMacroPredicates": 0.28119897842407227, + "SimplifyNeuronTensor": 0.4106020927429199, + "SimplifySlice": 0.03202533721923828, + "SimplifyTensor": 0.27721595764160156, + "SpillPSum": 0.7228884696960449, + "SplitAPUnionSets": 0.5242888927459717, + "SplitAccGrp": 0.049216270446777344, + "StaticProfiler": 0.13759756088256836, + "StaticTransposeLocalTensor": 0.24800348281860352, + "SundaISel": 1.4039628505706787, + "TCTransform": 0.03422880172729492, + "TensorInitialization": 0.22466588020324707, + "TensorOpSimplifier": 0.699378490447998, + "TensorOpTransform": 2.3107619285583496, + "TensorizerLegalizationPass": 0.00019799999427050352, + "TileCCOps": 0.18677926063537598, + "TilingProfiler": 0.47794580459594727, + "TransformConvOp": 0.16123557090759277, + "TritiumFusion": 1.6316783428192139, + "ValueNumbering": 0.10072684288024902, + "VectorizeDMA": 0.6808476448059082, + "VectorizeMatMult": 0.08075761795043945, + "VerifySupportedOps": 0.0003100000030826777, + "WeightCoalescing": 0.06566977500915527, + "ZeroSizeTensorElimination": 0.00043392181396484375, + "algsimp": 0.005183999892324209, + "batchnorm_expander": 0.0011320000048726797, + "boundary-marker-removal": 0.0006000000284984708, + "call-inliner": 0.00027799999224953353, + "canonicalize-boundary-marker": 0.0006880000000819564, + "collective-stream-id-checker": 9.40000027185306e-05, + "comparison-expander": 0.0006220000213943422, + "computation-deduplicator": 0.000534999999217689, + "config-lowering": 0.0007169999880716205, + "constant_folding": 0.00022499999613501132, + "cse": 0.000783999974373728, + "dce": 5.199999941396527e-05, + "dynamic-slice-transpose": 0.0002229999954579398, + "eliminate-redundant-compare": 0.00020399999630171806, + "emit-offloaded-dropout": 0.00029200001154094934, + "flatten-call-graph": 0.0005019999807700515, + "fuse-send-recv": 0.0025220001116394997, + "hilo-conditional-to-select": 0.00012399999832268804, + "hilo::LegalizeAlias": 0.0030360000673681498, + "hilo::NeuronInstCombine": 0.0013549999566748738, + "hilo::NeuronOpFusion": 0.0002460000105202198, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00023499999952036887, + "hilo::ScheduleFusion": 4.199999966658652e-05, + "hilo::SixtyFourHack": 0.000311999989207834, + "hilo::VerifyAliasing": 0.00010900000052060932, + "hlo-mac-count": 0.005384000018239021, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015450000064447522, + "legalize-ccops-for-tensorizer": 2.5999999706982635e-05, + "legalize-compare": 0.0005339999916031957, + "lower-argminmax-custom-call": 0.00018699999782256782, + "map-inline": 0.0009280000231228769, + "metadata-naming": 0.0017320000333711505, + "mlir::detail::OpToOpPassAdaptor": 0.00020300000323913991, + "mlir::hlo::MhloToPyPenguin": 0.08361499756574631, + "mlir::mhlo::LowerComplexExtraPass": 0.002681999932974577, + "mlir::mhlo::LowerComplexPass": 0.004726999904960394, + "native-to-custom-softmax": 0.0006019999855197966, + "native-to-custom-softmax-dx": 0.0006000000284984708, + "neuron-hlo-verifier": 0.024771999567747116, + "operand_upcaster": 0.0010130000300705433, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06845100224018097, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003169999981764704, + "reshape-mover": 0.00010699999984353781, + "simplify-concat": 0.005785999819636345, + "simplify-while-loops": 7.000000186963007e-05, + "transform-variadic-reduce": 0.0009619999909773469, + "tuple-simplifier": 0.00022000000171829015, + "unpack-nested-aws-ntwsr": 0.0005869999877177179, + "unroll-while-loop": 9.999999747378752e-06 + }, + "hilo": { + "HloMacCount": 8777859072.0, + "Traffic": 3915494144.0 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 254484, + "StaticProfiler::AifUb": 31.318378448486328, + "StaticProfiler::ArithmeticIntensityTensorizer": 49.50009536743164, + "StaticProfiler::AverageDmaLength": 4651.40234375, + "StaticProfiler::DDRTransferBytes": 3663457236, + "StaticProfiler::InternalTransferBytes": 1304767680, + "StaticProfiler::LoadExpanded": 636198, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 644547, + "StaticProfiler::TotalDynamicInstancesCount": 287250, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 214600, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 146593, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 71005, + "TilingProfiler::PfTransposeInstructionsForIo": 66880, + "TilingProfiler::PfTransposeInstructionsForLocal": 2163, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 1125, + "TilingProfiler::SimdInstructionsAfterTiling": 5176, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "all": { + "compiletime": { + "CanonicalizeConv": 0.0, + "CanonicalizeForTensorizer": 0.00041000000783242285, + "Canonicalizer": 0.009233999997377396, + "HoistCompute": 8.399999933317304e-05, + "IdentifyCrossPassTensors": 0.00015100000018719584, + "MemcastMotion": 0.00013299999409355223, + "PenguinizeFunctions": 0.00019799999427050352, + "PruneFunctions": 0.0004140000091865659, + "RemoveOptimizationBarriers": 6.900000153109431e-05, + "ScatterMotion": 0.003625999903306365, + "TensorizerLegalizationPass": 0.00019799999427050352, + "VerifySupportedOps": 0.0003100000030826777, + "algsimp": 0.005183999892324209, + "batchnorm_expander": 0.0011320000048726797, + "boundary-marker-removal": 0.0006000000284984708, + "call-inliner": 0.00027799999224953353, + "canonicalize-boundary-marker": 0.0006880000000819564, + "collective-stream-id-checker": 9.40000027185306e-05, + "comparison-expander": 0.0006220000213943422, + "computation-deduplicator": 0.000534999999217689, + "config-lowering": 0.0007169999880716205, + "constant_folding": 0.00022499999613501132, + "cse": 0.000783999974373728, + "dce": 5.199999941396527e-05, + "dynamic-slice-transpose": 0.0002229999954579398, + "eliminate-redundant-compare": 0.00020399999630171806, + "emit-offloaded-dropout": 0.00029200001154094934, + "flatten-call-graph": 0.0005019999807700515, + "fuse-send-recv": 0.0025220001116394997, + "hilo-conditional-to-select": 0.00012399999832268804, + "hilo::LegalizeAlias": 0.0030360000673681498, + "hilo::NeuronInstCombine": 0.0013549999566748738, + "hilo::NeuronOpFusion": 0.0002460000105202198, + "hilo::ReplaceTokenTypeWithU8Pass": 0.00023499999952036887, + "hilo::ScheduleFusion": 4.199999966658652e-05, + "hilo::SixtyFourHack": 0.000311999989207834, + "hilo::VerifyAliasing": 0.00010900000052060932, + "hlo-mac-count": 0.005384000018239021, + "io-con-pipe-begin": 7.999999979801942e-06, + "io-con-pipe-end": 1.9999999949504854e-06, + "io-layout-normalization": 0.0015450000064447522, + "legalize-ccops-for-tensorizer": 2.5999999706982635e-05, + "legalize-compare": 0.0005339999916031957, + "lower-argminmax-custom-call": 0.00018699999782256782, + "map-inline": 0.0009280000231228769, + "metadata-naming": 0.0017320000333711505, + "mlir::detail::OpToOpPassAdaptor": 0.00020300000323913991, + "mlir::hlo::MhloToPyPenguin": 0.08361499756574631, + "mlir::mhlo::LowerComplexExtraPass": 0.002681999932974577, + "mlir::mhlo::LowerComplexPass": 0.004726999904960394, + "native-to-custom-softmax": 0.0006019999855197966, + "native-to-custom-softmax-dx": 0.0006000000284984708, + "neuron-hlo-verifier": 0.024771999567747116, + "operand_upcaster": 0.0010130000300705433, + "post-par-pipe-begin": 9.999999974752427e-07, + "post-par-pipe-end": 0.0, + "post-partition-simplification": 0.06845100224018097, + "pre-hlo-begin": 6.000000212225132e-06, + "pre-hlo-end": 9.999999974752427e-07, + "replace-minimum-constant": 0.0003169999981764704, + "reshape-mover": 0.00010699999984353781, + "simplify-concat": 0.005785999819636345, + "simplify-while-loops": 7.000000186963007e-05, + "transform-variadic-reduce": 0.0009619999909773469, + "tuple-simplifier": 0.00022000000171829015, + "unpack-nested-aws-ntwsr": 0.0005869999877177179, + "unroll-while-loop": 9.999999747378752e-06 + } + }, + "cumsum": { + "compiletime": { + "CoalesceCCOp": 0.0002605915069580078, + "DMALocalityOpt": 0.00019121170043945313, + "DMAProfiler": 0.0007300376892089844, + "DataStreaming": 0.0002942085266113281, + "DoNothing": 0.0007104873657226563, + "ExpandISAMacro": 0.0005214214324951172, + "FactorizeBlkDims": 0.0004582405090332031, + "InferPSumTensor": 0.0006060600280761719, + "InferSharedMemLoc": 0.0003056526184082031, + "InsertCoreBarrier": 0.00026702880859375, + "LateLegalizeInst": 0.0003993511199951172, + "LateNeuronInstComb": 0.0006673336029052734, + "LegalizeSundaAccess": 0.0015358924865722656, + "LegalizeType": 0.0002582073211669922, + "LowerBroadcast": 0.0002548694610595703, + "LowerIntrinsics": 0.0002319812774658203, + "LowerTranspose": 0.0002675056457519531, + "NeuronInstComb": 0.0007252693176269531, + "NeuronLICM": 0.0003943443298339844, + "NeuronSimplifyPredicates": 0.0022835731506347656, + "NeuronValueNumbering": 0.0004334449768066406, + "SFKVectorizer": 0.002683401107788086, + "SimpleAllReduceTiling": 0.0002601146697998047, + "SimplifyNeuronTensor": 0.0005571842193603516, + "SpillPSum": 0.0006034374237060547, + "WeightCoalescing": 0.00022268295288085938 + } + }, + "sg00": { + "hilo": { + "ArithmeticIntensity": 4.483653545379639, + "HloMacCount": 8777859072.0, + "Traffic": 3915494144.0 + } + }, + "sg0000": { + "compiletime": { + "AGOrderingAnalysisPass": 2.577322006225586, + "AffinePredicateResolution": 0.09284520149230957, + "AliasDependencyElimination": 0.004574298858642578, + "AliasDependencyInduction": 0.813002347946167, + "AliasDependencyReset": 0.8516831398010254, + "BFComputeCutting": 0.4163675308227539, + "BirCodeGenLoop": 2.0439093112945557, + "CCOpFusion": 1.2363176345825195, + "CanonicalizeDAGForPGTiling": 0.1793205738067627, + "CanonicalizeIR": 0.10800528526306152, + "CoalesceCCOp": 0.17473959922790527, + "CommuteConcat": 0.03344416618347168, + "DMALocalityOpt": 0.04196333885192871, + "DMAProfiler": 0.08075666427612305, + "DMATilingProfiler": 0.09637188911437988, + "DataLocalityOpt": 3.2345950603485107, + "DataStreaming": 0.16798925399780273, + "DeConcat": 0.06256246566772461, + "DeadCodeElimination": 0.03286480903625488, + "DeadStoreElimination": 1.1570920944213867, + "DelinearIndices": 0.4294607639312744, + "Delinearization": 0.16837501525878906, + "DelinearizeSPMD": 0.21433115005493164, + "DoNothing": 6.389617919921875e-05, + "DramToDramTranspose": 0.341036319732666, + "DumpGraphAndMetadata": 0.14029169082641602, + "EliminateDivs": 0.23865985870361328, + "ExpandBatchNorm": 0.11588501930236816, + "ExpandISAMacro": 0.08861160278320313, + "FactorizeBlkDims": 0.6043739318847656, + "FactorizeThreadAxesInFreeDims": 0.0903313159942627, + "FlattenMacroLoop": 0.10088062286376953, + "GenericAccessSimplifier": 0.02825760841369629, + "InferInitValue": 1.7035224437713623, + "InferIntrinsicOnCC": 0.3380553722381592, + "InferNeuronTensor": 1.7164275646209717, + "InferNonlocalTensors": 5.598329544067383, + "InferPSumTensor": 1.4166266918182373, + "InferShardAxis": 9.271984100341797, + "InferSharedMemLoc": 0.11923456192016602, + "InlineNativeKernels": 0.057215213775634766, + "InsertCoreBarrier": 0.13120818138122559, + "InsertIOTransposes": 0.876704216003418, + "InsertImplicitShardAxisBeforeISel": 0.41373419761657715, + "InsertLocalTransposes": 0.7329738140106201, + "InsertOffloadedTransposes": 0.13259291648864746, + "LICM": 0.12085676193237305, + "LateLegalizeInst": 0.14969396591186523, + "LateLegalizePostSplit": 0.1052546501159668, + "LateLowerReshapeOp": 0.03813338279724121, + "LateLowerTensorOp": 0.3610198497772217, + "LateNeuronInstComb": 1.1523239612579346, + "LayoutPreprocessing": 0.9314424991607666, + "LayoutPreprocessingAndAnalysis": 1.3875887393951416, + "LayoutRequirementAnalysis": 0.4473288059234619, + "LegalizeCCOpLayout": 0.13658738136291504, + "LegalizeOpLevelAlias": 0.05193901062011719, + "LegalizePartitionReduce": 0.09119105339050293, + "LegalizeSundaAccess": 0.966181755065918, + "LegalizeSundaMacro": 0.6998641490936279, + "LegalizeType": 0.1395578384399414, + "LocalLayoutOpt": 0.6927175521850586, + "LoopFusion": 0.38257908821105957, + "LoopSplitting": 0.031409502029418945, + "LowerBroadcast": 0.5450491905212402, + "LowerCCOpBlockAxis": 0.2258608341217041, + "LowerComplexBroadcast": 0.08743643760681152, + "LowerIntrinsics": 0.9320821762084961, + "LowerShardAxis": 0.23558402061462402, + "LowerTensorOp": 0.6309223175048828, + "LowerToSendRecv": 0.16785597801208496, + "LowerTranspose": 0.48870348930358887, + "MacroGeneration": 2.3815677165985107, + "MaskPropagation": 0.11515140533447266, + "MemcpyElimination": 9.188785552978516, + "MutateDataType": 0.04154014587402344, + "NeuronAliasDependencyInduction": 0.02184748649597168, + "NeuronAliasDependencyReset": 0.03938126564025879, + "NeuronInstComb": 0.4020655155181885, + "NeuronLICM": 0.27831411361694336, + "NeuronLoopFusion": 1.5726032257080078, + "NeuronLoopInterchange": 0.0657188892364502, + "NeuronSimplifier": 0.5047965049743652, + "NeuronSimplifyPredicates": 0.27771592140197754, + "NeuronValueNumbering": 0.1258838176727295, + "OptimizeAliasedCopyChain": 0.020284414291381836, + "OptimizeNKIKernels": 1.7541263103485107, + "PAGLayoutOpt": 15.631235122680664, + "PComputeCutting": 0.3185763359069824, + "PGLayoutTilingPipeline": 41.048789978027344, + "PGTiling": 6.015847206115723, + "PadElimination": 0.021477460861206055, + "ParAxesAnnotation": 14.886741638183594, + "PartialLoopFusion": 1.2531118392944336, + "PartialSimdFusion": 0.8362796306610107, + "PerfectLoopNest": 0.07204103469848633, + "RecognizeOpIdiom": 0.12753748893737793, + "Recompute": 0.00962972640991211, + "RelaxPredicates": 0.11856865882873535, + "Rematerialization": 0.21219396591186523, + "RemoveShardedPartitionAxes": 1.250422716140747, + "ReshapeWeights": 0.03184843063354492, + "ResolveAccessConflict": 0.19439220428466797, + "ResolveComplicatePredicates": 0.07511138916015625, + "RewriteReplicationMatmul": 0.04624319076538086, + "RewriteWeights": 0.08813834190368652, + "SFKVectorizer": 6.71961784362793, + "ShardingPropagationAnalysis": 0.7337610721588135, + "SimpleAllReduceTiling": 0.06714820861816406, + "Simplifier": 0.10178971290588379, + "SimplifyMacroPredicates": 0.28119897842407227, + "SimplifyNeuronTensor": 0.36328887939453125, + "SimplifySlice": 0.03202533721923828, + "SimplifyTensor": 0.27721595764160156, + "SpillPSum": 0.7098052501678467, + "SplitAPUnionSets": 0.5242888927459717, + "SplitAccGrp": 0.049216270446777344, + "StaticProfiler": 0.13759756088256836, + "StaticTransposeLocalTensor": 0.24800348281860352, + "SundaISel": 1.4039628505706787, + "TCTransform": 0.03422880172729492, + "TensorInitialization": 0.22466588020324707, + "TensorOpSimplifier": 0.699378490447998, + "TensorOpTransform": 2.3107619285583496, + "TileCCOps": 0.18677926063537598, + "TilingProfiler": 0.47794580459594727, + "TransformConvOp": 0.16123557090759277, + "TritiumFusion": 1.6316783428192139, + "ValueNumbering": 0.10072684288024902, + "VectorizeDMA": 0.6808476448059082, + "VectorizeMatMult": 0.08075761795043945, + "WeightCoalescing": 0.06300926208496094, + "ZeroSizeTensorElimination": 0.00043392181396484375 + }, + "tensorizer": { + "DMATilingProfiler::TotalInstructionsAfterTiling": 254484, + "StaticProfiler::AifUb": 31.318378448486328, + "StaticProfiler::ArithmeticIntensityTensorizer": 49.50009536743164, + "StaticProfiler::AverageDmaLength": 4651.40234375, + "StaticProfiler::AverageFractalPeUtilization": 98.94332885742188, + "StaticProfiler::AveragePartitionUtilization": 86.24988555908203, + "StaticProfiler::AveragePeUtilization": 68.06179809570313, + "StaticProfiler::DDRTransferBytes": 3663457236, + "StaticProfiler::InternalTransferBytes": 1304767680, + "StaticProfiler::LoadExpanded": 636198, + "StaticProfiler::LocalizationEfficiency": 158.05445861816406, + "StaticProfiler::LocalizationEfficiencyIgnoreNonlocal": 160.4937286376953, + "StaticProfiler::StoreExpanded": 8349, + "StaticProfiler::TotalDMAExpanded": 644547, + "StaticProfiler::TotalDynamicInstancesCount": 287250, + "StaticProfiler::TotalDynamicInstancesWithMmPackedCount": 214600, + "StaticProfiler::TotalLNCComm": 0, + "StaticProfiler::TotalLNCCommTransfer": 0, + "TilingProfiler::AveragePartitionUtilizationAfterTiling": 0, + "TilingProfiler::AveragePeUtilizationAfterTiling": 0, + "TilingProfiler::BatchnormInstructionsAfterTiling": 0, + "TilingProfiler::DmaInstructionsAfterTiling": 0, + "TilingProfiler::GenericInstructionsAfterTiling": 243, + "TilingProfiler::MatMultInstructionsAfterTiling": 146593, + "TilingProfiler::NumPfTransposes": 294, + "TilingProfiler::NumPfTransposesForIo": 30, + "TilingProfiler::NumPfTransposesForLocal": 144, + "TilingProfiler::NumPfTransposesForNonlocal": 120, + "TilingProfiler::PfTransposeInstructions": 71005, + "TilingProfiler::PfTransposeInstructionsForIo": 66880, + "TilingProfiler::PfTransposeInstructionsForLocal": 2163, + "TilingProfiler::PfTransposeInstructionsForNonlocal": 1962, + "TilingProfiler::ReduceInstructionsAfterTiling": 1125, + "TilingProfiler::SimdInstructionsAfterTiling": 5176, + "TilingProfiler::TotalInstructionsAfterTiling": 0, + "TransformConvOp::Conv1d_depthwise_bf01_oi01_bf01": 0, + "TransformConvOp::Conv2d_dw_fb01_io01_01bf_rep_nhwc_Pcinh": 0, + "TransformConvOp::Conv2d_pbp_0f1b_0i1o_01fb_experimental_1": 0, + "TransformConvOp::Conv2d_pbp_fb01_io01_01bf_experimental_1": 0, + "TransformConvOp::conv2d_column_packing": 0, + "TransformConvOp::conv2d_column_packing_1": 0, + "TransformConvOp::conv2d_column_packing_io10": 0, + "TransformConvOp::conv2d_depthwise_f01b_o01i_bf01": 0 + } + }, + "topk": { + "compiletime": { + "CoalesceCCOp": 0.002533435821533203, + "DMALocalityOpt": 0.0021975040435791016, + "DMAProfiler": 0.0037984848022460938, + "DataStreaming": 0.004228353500366211, + "DoNothing": 0.00015997886657714844, + "ExpandISAMacro": 0.002737283706665039, + "FactorizeBlkDims": 0.007441997528076172, + "InferPSumTensor": 0.007424831390380859, + "InferSharedMemLoc": 0.0019435882568359375, + "InsertCoreBarrier": 0.0022521018981933594, + "LateLegalizeInst": 0.005324602127075195, + "LateNeuronInstComb": 0.005987644195556641, + "LegalizeSundaAccess": 0.009059429168701172, + "LegalizeType": 0.005707263946533203, + "LowerBroadcast": 0.002416849136352539, + "LowerIntrinsics": 0.002602815628051758, + "LowerTranspose": 0.002377033233642578, + "NeuronInstComb": 0.006898641586303711, + "NeuronLICM": 0.006481647491455078, + "NeuronSimplifyPredicates": 0.0025467872619628906, + "NeuronValueNumbering": 0.0026929378509521484, + "SFKVectorizer": 0.01964735984802246, + "SimpleAllReduceTiling": 0.0024671554565429688, + "SimplifyNeuronTensor": 0.04675602912902832, + "SpillPSum": 0.012479782104492188, + "WeightCoalescing": 0.0024378299713134766 + } + } +} \ No newline at end of file diff --git a/token_generation_model/_tp0_bk5/graph.neff b/token_generation_model/_tp0_bk5/graph.neff new file mode 100644 index 0000000000000000000000000000000000000000..e0a147185604e99a58fe43296521e09f9cd2c4b0 --- /dev/null +++ b/token_generation_model/_tp0_bk5/graph.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fd79ce1a7d4475261bf89398504b1054163d9dc97c945814252ecd96994960 +size 11981824 diff --git a/token_generation_model/_tp0_bk5/log-neuron-cc.txt b/token_generation_model/_tp0_bk5/log-neuron-cc.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef998143a799b7e1a4f088ac6f71296d197ceaab --- /dev/null +++ b/token_generation_model/_tp0_bk5/log-neuron-cc.txt @@ -0,0 +1,4621 @@ +2025-11-04T21:38:36Z INFO 8808 [root]: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework=XLA /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff --target=trn2 --auto-cast=none --model-type=transformer '--tensorizer-options=--enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma' --lnc=2 -O2 --internal-hlo2tensorizer-options=--verify-hlo=true --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/log-neuron-cc.txt --verbose=35 +2025-11-04T21:38:36Z INFO 8808 [root]: NeuronX Compiler version 2.21.33363.0+82129205 Python version 3.10.12 HWM version 2.21.0.33363+82129205 NumPy version 1.26.4 Running on AMI ami-00632e4ca97ea8199 Running in region usw2-az2 +2025-11-04T21:38:36Z INFO 8881 [root]: XLA detected +2025-11-04T21:38:36Z INFO 8881 [root]: Pipeline: HLOToTensorizer Frontend StaticIOTranspose WalrusDriver BIRLinker Kelper NeffWrapper +2025-11-04T21:38:36Z INFO 8881 [root]: Intermediate files stored in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704, output in /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5 +2025-11-04T21:38:36Z INFO 8881 [pipeline.Pipeline.0]: Job Pipeline len(in_states) 1 +2025-11-04T21:38:36Z INFO 8881 [pipeline.Pipeline.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8881 [pipeline.Pipeline.0]: Running pipeline Pipeline.0 +2025-11-04T21:38:36Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.HLOToTensorizer.0 +2025-11-04T21:38:36Z INFO 8881 [job.HLOToTensorizer.0]: Job HLOToTensorizer len(in_states) 1 +2025-11-04T21:38:36Z INFO 8881 [job.HLOToTensorizer.0]: Processing input #0 +2025-11-04T21:38:36Z INFO 8881 [job.HLOToTensorizer.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo2penguin --input /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb --out-dir ./ --output penguin.py --remat --max-costly-ops=2 --max-live-in-size=5 --max-remat-chain-size=10 --max-mem-multiple=1.8 --min-def-use-distance=500 --remat-policy=transformer --allow-same-pass-remat=true --verbose=error --logfile=/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/log-neuron-cc.txt --logfile-verbose=info --layers-per-module=1 --emit-tensor-level-dropout-ops --verify-hlo=true --native-to-custom-softmax --partitioner-opts='--transformer' +2025-11-04T21:38:37Z INFO 8881 [job.HLOToTensorizer.0]: Replaced 0 dropout sequences with OffloadedDropout +HLO Ops used in computation: add all-gather all-reduce broadcast compare concatenate constant convert cosine custom-call divide dot exponential gather get-tuple-element iota maximum multiply negate parameter reduce reshape rng scatter select sine slice subtract transpose tuple +2025-11-04 21:38:37.081141: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:88] Could not open file debug_info_hlo_partitions.json +2025-11-04 21:38:37.089322: W hilo/hlo2penguin/utils/DumpDebugInfo.cc:52] Truncating long HLO operator name %tuple.10701 = tuple(%reshape.4385, %scatter.9929, %scatter.9944, %scatter.9957, %scatter.9972, %scatter.9985, %scatter.10000, %scatter.10013, %scatter.10028, %scatter.10041, %scatter.10056, %scatter.10069, %scatter.10084, %scatter.10097, %scatter.10112, %scatter.10125, %scatter.10140, %scatter.10153, %scatter.10168, %scatter.10181, %scatter.10196, %scatter.10209, %scatter.10224, %scatter.10237, %scatter.10252, %scatter.10265, %scatter.10280, %scatter.10293, %scatter.10308, %scatter.10321, %scatter.10336... to 512 characters in the compiler's debug metadata +Invoking RemoveOptimizationBarriers pass + +2025-11-04T21:38:37Z INFO 8881 [job.HLOToTensorizer.0]: IR signature: 13bc1c8fbc10e0af0c938a136d15a5945d42f00acd0c54e317bd266124e65ef1 for sg0000/HLOToTensorizer +2025-11-04T21:38:37Z INFO 8881 [job.HLOToTensorizer.0]: Job #0 finished +2025-11-04T21:38:37Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.HLOToTensorizer.0 +2025-11-04T21:38:37Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.Frontend.0 +2025-11-04T21:38:37Z INFO 8881 [job.Frontend.0]: Job Frontend len(in_states) 1 +2025-11-04T21:38:37Z INFO 8881 [job.Frontend.0]: Processing input #0 +2025-11-04T21:38:37Z INFO 8881 [job.Frontend.0]: Start model loading +2025-11-04T21:38:37Z INFO 8881 [job.Frontend.0]: Start tensorization +2025-11-04T21:38:37Z INFO 8881 [job.Frontend.0]: Num jobs: 1 +2025-11-04T21:38:37Z USER 8881 [root/Tensorizer/Tensorizer]: Running Tensorizer +2025-11-04T21:38:37Z INFO 8881 [Tensorizer]: Frontend did not find netlist info. Switching to flat flow. +2025-11-04T21:38:37Z INFO 8881 [Tensorizer]: Building model from Penguin script "penguin.py"... +2025-11-04T21:38:39Z INFO 8881 [Tensorizer]: Tensorizer options: --enable-ccop-compute-overlap --cc-pipeline-tiling-factor=1 --vectorize-strided-dma --run-pg-layout-and-tiling --enable-dse-after-mask-propagation --disable-concat-delinearizer --num-neuroncores-per-sengine=2 --num-neuroncores-per-sengine=2 --internal_dynamic_dma_scratch_size_per_partition=16384 --disable-bitcasted-transpose --dont-verify-after-all --fp32-cast=none --mm-transpose-type=fp32 --disable-expensive-checks --disable-max-stride-tiling --hbm-scratchpad-page-size-in-bytes=536870912 --enable-replication --max-local-tensor-tile-size-in-bytes=32768 --tensor-layout-p-order=0 --tensor-layout-b-order=1 --enable-advanced-delinearization --weight-coalescing-threshold=512 --enable-bir-converter=enable --enable-tritium-loopfusion --keep-remat-dma-transpose --enable-softmax-kernel --model-type-transformer --enable-isl-in-injective-check --enable-dge-on-io-dma --enable-dge-on-spill-reload-dma --enable-dge-on-indirect-dma --enable-dge-on-vector-indirect-dma --keep-rng-tensor-op +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Running LegalizeOpLevelAlias +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/LegalizeOpLevelAlias]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/LegalizeOpLevelAlias]: LegalizeOpLevelAlias finished after 0.052 seconds +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Running OptimizeAliasedCopyChain +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/OptimizeAliasedCopyChain]: OptimizeAliasedCopyChain finished after 0.020 seconds +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.163 seconds +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/TransformConvOp]: Running TransformConvOp +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/TransformConvOp]: Finished (changed=False) +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/TransformConvOp]: TransformConvOp finished after 0.161 seconds +2025-11-04T21:38:39Z INFO 8881 [sg0000/Tensorizer/LowerTensorOp]: Running LowerTensorOp +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/LowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/LowerTensorOp]: LowerTensorOp finished after 0.631 seconds +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.015 seconds +2025-11-04T21:38:40Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.508 seconds +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.556 seconds +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/LegalizeCCOpLayout]: Running LegalizeCCOpLayout +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/LegalizeCCOpLayout]: Finished (changed=False) +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/LegalizeCCOpLayout]: LegalizeCCOpLayout finished after 0.137 seconds +2025-11-04T21:38:41Z INFO 8881 [sg0000/Tensorizer/TensorOpSimplifier]: Running TensorOpSimplifier +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/TensorOpSimplifier]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/TensorOpSimplifier]: TensorOpSimplifier finished after 0.699 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/CanonicalizeIR]: Running CanonicalizeIR +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/CanonicalizeIR]: Finished (changed=True) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/CanonicalizeIR]: CanonicalizeIR finished after 0.108 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/ResolveComplicatePredicates]: Running ResolveComplicatePredicates +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/ResolveComplicatePredicates]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/ResolveComplicatePredicates]: ResolveComplicatePredicates finished after 0.075 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/AffinePredicateResolution]: Running AffinePredicateResolution +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/AffinePredicateResolution]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/AffinePredicateResolution]: AffinePredicateResolution finished after 0.093 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/EliminateDivs]: Running EliminateDivs +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/EliminateDivs]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/EliminateDivs]: EliminateDivs finished after 0.239 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.056 seconds +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:42Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.320 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.318 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.195 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.834 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.070 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.098 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.078 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.079 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/ExpandBatchNorm]: Running ExpandBatchNorm +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/ExpandBatchNorm]: Finished (changed=False) +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/ExpandBatchNorm]: ExpandBatchNorm finished after 0.116 seconds +2025-11-04T21:38:43Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.080 seconds +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.073 seconds +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform +2025-11-04T21:38:44Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_0 +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_0 finished after 1.969 seconds +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: Running TensorOpTransform_iteration_1 +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform_iteration_1 finished after 0.340 seconds +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/TensorOpTransform]: TensorOpTransform finished after 2.311 seconds +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/LateLowerTensorOp]: Running LateLowerTensorOp +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/LateLowerTensorOp]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/LateLowerTensorOp]: LateLowerTensorOp finished after 0.361 seconds +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/AliasDependencyReset]: Running AliasDependencyReset +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.009 seconds +2025-11-04T21:38:46Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Running AliasDependencyInduction +2025-11-04T21:38:47Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:38:47Z INFO 8881 [sg0000/Tensorizer/AliasDependencyInduction]: AliasDependencyInduction finished after 0.813 seconds +2025-11-04T21:38:47Z INFO 8881 [sg0000/Tensorizer/AliasDependencyReset]: AliasDependencyReset finished after 0.852 seconds +2025-11-04T21:38:47Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination +2025-11-04T21:38:47Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_0 +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_0 finished after 8.685 seconds +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: Running MemcpyElimination_iteration_1 +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination_iteration_1 finished after 0.502 seconds +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: Finished (changed=True) +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/MemcpyElimination]: MemcpyElimination finished after 9.189 seconds +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:38:56Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:57Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.937 seconds +2025-11-04T21:38:57Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.331 seconds +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_2 +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_2 finished after 0.248 seconds +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.531 seconds +2025-11-04T21:38:58Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.437 seconds +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 2.517 seconds +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Rematerialization]: Running Rematerialization +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Rematerialization]: Finished (changed=True) +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Rematerialization]: Rematerialization finished after 0.212 seconds +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.222 seconds +2025-11-04T21:38:59Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.163 seconds +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_2 +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_2 finished after 0.162 seconds +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.548 seconds +2025-11-04T21:39:00Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:01Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=True) +2025-11-04T21:39:01Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.913 seconds +2025-11-04T21:39:01Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.662 seconds +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.135 seconds +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.137 seconds +2025-11-04T21:39:02Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.073 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.163 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.199 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_1 +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_1 finished after 0.120 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.237 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.564 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/SimplifySlice]: Running SimplifySlice +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/SimplifySlice]: Finished (changed=False) +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/SimplifySlice]: SimplifySlice finished after 0.032 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.058 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.108 seconds +2025-11-04T21:39:03Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_1 +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_1 finished after 0.105 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.214 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=True) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.126 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.058 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/PadElimination]: Running PadElimination +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/PadElimination]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/PadElimination]: PadElimination finished after 0.021 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.155 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.113 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Running LoopFusion_iteration_0 +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion_iteration_0 finished after 0.192 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/LoopFusion]: LoopFusion finished after 0.383 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.030 seconds +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:04Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.105 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.106 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.056 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: Running ValueNumbering +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/ValueNumbering]: ValueNumbering finished after 0.101 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Running TCTransform +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/TCTransform]: Finished (changed=True) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/TCTransform]: TCTransform finished after 0.034 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Running CommuteConcat_iteration_0 +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/CommuteConcat]: CommuteConcat finished after 0.033 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/RecognizeOpIdiom]: Running RecognizeOpIdiom_iteration_0 +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom_iteration_0 finished after 0.127 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/RecognizeOpIdiom]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/RecognizeOpIdiom]: RecognizeOpIdiom finished after 0.128 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.095 seconds +2025-11-04T21:39:05Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: Running DeadStoreElimination +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadStoreElimination]: DeadStoreElimination finished after 1.157 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Recompute]: Running Recompute +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Recompute]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Recompute]: Recompute finished after 0.010 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.032 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.032 seconds +2025-11-04T21:39:06Z INFO 8881 [Tensorizer]: After optimization: 958 statements +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/MutateDataType]: Running MutateDataType +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/MutateDataType]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/MutateDataType]: MutateDataType finished after 0.042 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Running GenericAccessSimplifier +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/GenericAccessSimplifier]: GenericAccessSimplifier finished after 0.028 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Running Simplifier_iteration_0 +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier_iteration_0 finished after 0.101 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Finished (changed=False) +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/Simplifier]: Simplifier finished after 0.102 seconds +2025-11-04T21:39:06Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: Running TileCCOps +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=32768 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: in bfloat16 (2048, 8) %'all_gather.1' = AllGatherOp-402 AllGather_add(bfloat16 (1024, 8) %'transpose.1', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((2048, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.50 | hlo_id: 50 | , id = 402 +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `All gather output tensor check failed` +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: in float32 (512, 8) %'all_gather.2' = AllGatherOp-9135 AllGather_add(float32 (256, 8) %'transpose.537', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9754 | hlo_id: 9754 | , id = 9135 +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: pass did not tile CC tensor due to `multi_rank_size=16384 is not above min_allgather_tile_size_in_bytes=8388608` +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: in uint32 (512, 8) %'all_gather.3' = AllGatherOp-9151 AllGather_add(uint32 (256, 8) %'transpose.538', replica_groups = [[0, 1]],all_gather_dim = DimensionSet((512, 8), {0}),stream_id = -1) # dl = tensor_op_name: _all-gather.9893 | hlo_id: 9893 | , id = 9151 +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/TileCCOps]: TileCCOps finished after 0.187 seconds +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.615 seconds +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.136 seconds +2025-11-04T21:39:07Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.334 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination_iteration_0 finished after 0.032 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/DeadCodeElimination]: DeadCodeElimination finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LateLowerReshapeOp]: Running LateLowerReshapeOp +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LateLowerReshapeOp]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LateLowerReshapeOp]: LateLowerReshapeOp finished after 0.038 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/InferIntrinsicOnCC]: Running InferIntrinsicOnCC +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/InferIntrinsicOnCC]: Finished (changed=False) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/InferIntrinsicOnCC]: InferIntrinsicOnCC finished after 0.338 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: Running ResolveAccessConflict +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_0 finished after 0.033 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: Running DeadCodeElimination_iteration_1 +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: DeadCodeElimination_iteration_1 finished after 0.031 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/ResolveAccessConflict]: ResolveAccessConflict finished after 0.194 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.058 seconds +2025-11-04T21:39:08Z INFO 8881 [sg0000/Tensorizer/LocalLayoutOpt]: Running LocalLayoutOpt +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/LocalLayoutOpt]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/LocalLayoutOpt]: LocalLayoutOpt finished after 0.693 seconds +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=False) +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.378 seconds +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/PGLayoutTilingPipeline]: Running PGLayoutTilingPipeline +2025-11-04T21:39:09Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.243 seconds +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: Running LayoutPreprocessingAndAnalysis +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/LayoutPreprocessing]: Running LayoutPreprocessing +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:10Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.137 seconds +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/LayoutPreprocessing]: Finished (changed=True) +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/LayoutPreprocessing]: LayoutPreprocessing finished after 0.931 seconds +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/LayoutRequirementAnalysis]: Running LayoutRequirementAnalysis +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/LayoutRequirementAnalysis]: LayoutRequirementAnalysis finished after 0.447 seconds +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/LayoutPreprocessingAndAnalysis]: LayoutPreprocessingAndAnalysis finished after 1.388 seconds +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/InferNonlocalTensors]: Running InferNonlocalTensors +2025-11-04T21:39:11Z INFO 8881 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:13Z INFO 8881 [sg0000/Tensorizer/InferNonlocalTensors]: prefer_non_broadcast_par: True +2025-11-04T21:39:17Z INFO 8881 [sg0000/Tensorizer/InferNonlocalTensors]: Finished (changed=False) +2025-11-04T21:39:17Z INFO 8881 [sg0000/Tensorizer/InferNonlocalTensors]: InferNonlocalTensors finished after 5.598 seconds +2025-11-04T21:39:17Z INFO 8881 [sg0000/Tensorizer/PAGLayoutOpt]: Running PAGLayoutOpt +2025-11-04T21:39:17Z INFO 8881 [sg0000/Tensorizer/ParAxesAnnotation]: Running ParAxesAnnotation +2025-11-04T21:39:17Z INFO 8881 [sg0000/Tensorizer/LayoutSearchAlgorithm]: prefer_non_broadcast_par: True +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/ParAxesAnnotation]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/ParAxesAnnotation]: ParAxesAnnotation finished after 14.887 seconds +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/InsertLocalTransposes]: Running InsertLocalTransposes +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/InsertLocalTransposes]: Finished (changed=True) +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/InsertLocalTransposes]: InsertLocalTransposes finished after 0.733 seconds +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/PAGLayoutOpt]: PAGLayoutOpt finished after 15.631 seconds +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/DelinearizeSPMD]: Running DelinearizeSPMD +2025-11-04T21:39:32Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Running Delinearization +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/Delinearization]: Delinearization finished after 0.168 seconds +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/DelinearizeSPMD]: Finished (changed=False) +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/DelinearizeSPMD]: DelinearizeSPMD finished after 0.214 seconds +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/ShardingPropagationAnalysis]: Running ShardingPropagationAnalysis +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/ShardingPropagationAnalysis]: ShardingPropagationAnalysis finished after 0.734 seconds +2025-11-04T21:39:33Z INFO 8881 [sg0000/Tensorizer/InferShardAxis]: Running InferShardAxis +2025-11-04T21:39:40Z INFO 8881 [sg0000/Tensorizer/ShardResult]: =================== Dumping Debug Info ===================== +2025-11-04T21:39:40Z INFO 8881 [sg0000/Tensorizer/ShardResult]: ------------------ Sharding summary ------------------ +total number of dags: 1159 +total number of sharded dags: 408 + +total bytes transferred from input, output, non local tensors: 3606653256 +total bytes transferred from input, output, non local tensors with 2x bandwidths: 3604021380 +% bytes transferred with 2x bandwidths: 99.93 + +NC0 FLOPs: 8807531673 +NC1 FLOPs: 8800192384 +% FLOPs sharded: 99.95 + + +Shard dim: 4096, Number of dags: 197 +Matmuls sharded with this dim: +[2,2,64] @ [2,64,4096(s)] = [2,4096(s)] Number of occurrences: 28 +[2,4096(s)] @ [4096(s),128] = [2,128] Number of occurrences: 28 + + +Shard dim: 2, Number of dags: 196 +Matmuls sharded with this dim: +[8,2(s),6,2,128] @ [2(s),6,2,128,8,2,128] = [8,8,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,128] = [8,2,2,128] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,2,64] = [8,2,2,2,2,64] Number of occurrences: 28 +[8,2(s),8,128] @ [2(s),8,128,2,2,2,64] = [8,2,2,2,64] Number of occurrences: 28 +[8,2,2,2,128] @ [2,2,2,128,2(s),2,4,128] = [8,2(s),2,4,128] Number of occurrences: 28 +[8,2,8,128] @ [2,8,128,2(s),6,2,128] = [8,2(s),6,2,128] Number of occurrences: 56 + + +Shard dim: 256, Number of dags: 10 +Matmuls sharded with this dim: + + +Shard dim: 8, Number of dags: 2 +Matmuls sharded with this dim: + + +Shard dim: 1024, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 512, Number of dags: 1 +Matmuls sharded with this dim: + + +Shard dim: 75968, Number of dags: 1 +Matmuls sharded with this dim: +[8,2,8,128] @ [2,8,128,75968(s)] = [8,75968(s)] Number of occurrences: 1 + + + +2025-11-04T21:39:41Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Running DelinearIndices +2025-11-04T21:39:41Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: Finished (changed=True) +2025-11-04T21:39:41Z INFO 8881 [sg0000/Tensorizer/DelinearIndices]: DelinearIndices finished after 0.429 seconds +2025-11-04T21:39:41Z INFO 8881 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Running RemoveShardedPartitionAxes +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/RemoveShardedPartitionAxes]: RemoveShardedPartitionAxes finished after 1.250 seconds +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/InferShardAxis]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/InferShardAxis]: InferShardAxis finished after 9.272 seconds +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: Running MaskPropagation +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/MaskPropagation]: MaskPropagation finished after 0.115 seconds +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Running CanonicalizeDAGForPGTiling +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: Finished (changed=True) +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/CanonicalizeDAGForPGTiling]: CanonicalizeDAGForPGTiling finished after 0.179 seconds +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Running LowerCCOpBlockAxis +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: Finished (changed=False) +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/LowerCCOpBlockAxis]: LowerCCOpBlockAxis finished after 0.226 seconds +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/PGTiling]: Running PGTiling +2025-11-04T21:39:43Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: Running AGOrderingAnalysisPass +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9404 of IO tensor {'CrossPassTensor': ''}bfloat16 %input4|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (79, 'AG3736'), (80, 'AG3735'), (218, 'AG3727'), (474, 'AG3726'), (274, 'AG3733')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9698 of IO tensor {'CrossPassTensor': ''}bfloat16 %input6|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (83, 'AG3752'), (84, 'AG3751'), (218, 'AG3727'), (474, 'AG3726'), (272, 'AG3749')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 9949 of IO tensor {'CrossPassTensor': ''}bfloat16 %input8|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (87, 'AG3768'), (88, 'AG3767'), (218, 'AG3727'), (474, 'AG3726'), (270, 'AG3765')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10200 of IO tensor {'CrossPassTensor': ''}bfloat16 %input10|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (91, 'AG3784'), (92, 'AG3783'), (218, 'AG3727'), (474, 'AG3726'), (268, 'AG3781')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10451 of IO tensor {'CrossPassTensor': ''}bfloat16 %input12|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (95, 'AG3800'), (96, 'AG3799'), (218, 'AG3727'), (474, 'AG3726'), (266, 'AG3797')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10702 of IO tensor {'CrossPassTensor': ''}bfloat16 %input14|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (99, 'AG3816'), (100, 'AG3815'), (218, 'AG3727'), (474, 'AG3726'), (264, 'AG3813')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 10953 of IO tensor {'CrossPassTensor': ''}bfloat16 %input16|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (103, 'AG3832'), (104, 'AG3831'), (218, 'AG3727'), (474, 'AG3726'), (262, 'AG3829')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11204 of IO tensor {'CrossPassTensor': ''}bfloat16 %input18|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (107, 'AG3848'), (108, 'AG3847'), (218, 'AG3727'), (474, 'AG3726'), (260, 'AG3845')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11455 of IO tensor {'CrossPassTensor': ''}bfloat16 %input20|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (111, 'AG3864'), (112, 'AG3863'), (218, 'AG3727'), (474, 'AG3726'), (258, 'AG3861')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11706 of IO tensor {'CrossPassTensor': ''}bfloat16 %input22|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (115, 'AG3880'), (116, 'AG3879'), (218, 'AG3727'), (474, 'AG3726'), (256, 'AG3877')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 11957 of IO tensor {'CrossPassTensor': ''}bfloat16 %input24|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (119, 'AG3896'), (120, 'AG3895'), (218, 'AG3727'), (474, 'AG3726'), (254, 'AG3893')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12208 of IO tensor {'CrossPassTensor': ''}bfloat16 %input26|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (123, 'AG3912'), (124, 'AG3911'), (218, 'AG3727'), (474, 'AG3726'), (252, 'AG3909')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input28|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (127, 'AG3928'), (128, 'AG3927'), (218, 'AG3727'), (474, 'AG3726'), (250, 'AG3925')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12710 of IO tensor {'CrossPassTensor': ''}bfloat16 %input30|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (131, 'AG3944'), (132, 'AG3943'), (218, 'AG3727'), (474, 'AG3726'), (248, 'AG3941')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 12961 of IO tensor {'CrossPassTensor': ''}bfloat16 %input32|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (135, 'AG3960'), (136, 'AG3959'), (218, 'AG3727'), (474, 'AG3726'), (246, 'AG3957')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13212 of IO tensor {'CrossPassTensor': ''}bfloat16 %input34|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (139, 'AG3976'), (140, 'AG3975'), (218, 'AG3727'), (474, 'AG3726'), (244, 'AG3973')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13463 of IO tensor {'CrossPassTensor': ''}bfloat16 %input36|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (143, 'AG3992'), (144, 'AG3991'), (218, 'AG3727'), (474, 'AG3726'), (242, 'AG3989')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13714 of IO tensor {'CrossPassTensor': ''}bfloat16 %input38|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (147, 'AG4008'), (148, 'AG4007'), (218, 'AG3727'), (474, 'AG3726'), (240, 'AG4005')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 13965 of IO tensor {'CrossPassTensor': ''}bfloat16 %input40|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (151, 'AG4024'), (152, 'AG4023'), (218, 'AG3727'), (474, 'AG3726'), (238, 'AG4021')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14216 of IO tensor {'CrossPassTensor': ''}bfloat16 %input42|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (155, 'AG4040'), (156, 'AG4039'), (218, 'AG3727'), (474, 'AG3726'), (236, 'AG4037')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14467 of IO tensor {'CrossPassTensor': ''}bfloat16 %input44|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (159, 'AG4056'), (160, 'AG4055'), (218, 'AG3727'), (474, 'AG3726'), (234, 'AG4053')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14718 of IO tensor {'CrossPassTensor': ''}bfloat16 %input46|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (163, 'AG4072'), (164, 'AG4071'), (218, 'AG3727'), (474, 'AG3726'), (232, 'AG4069')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 14969 of IO tensor {'CrossPassTensor': ''}bfloat16 %input48|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (167, 'AG4088'), (168, 'AG4087'), (218, 'AG3727'), (474, 'AG3726'), (230, 'AG4085')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15220 of IO tensor {'CrossPassTensor': ''}bfloat16 %input50|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (171, 'AG4104'), (172, 'AG4103'), (218, 'AG3727'), (474, 'AG3726'), (228, 'AG4101')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input52|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (175, 'AG4120'), (176, 'AG4119'), (218, 'AG3727'), (474, 'AG3726'), (226, 'AG4117')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15722 of IO tensor {'CrossPassTensor': ''}bfloat16 %input54|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (179, 'AG4136'), (180, 'AG4135'), (218, 'AG3727'), (474, 'AG3726'), (224, 'AG4133')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 15973 of IO tensor {'CrossPassTensor': ''}bfloat16 %input56|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (183, 'AG4152'), (184, 'AG4151'), (218, 'AG3727'), (474, 'AG3726'), (222, 'AG4149')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16224 of IO tensor {'CrossPassTensor': ''}bfloat16 %input58|NHWC|(8, 2, 2, 2, 2048, 2, 64) is not sorted, index list (w/ AG ids): [(77, 'AG3728'), (187, 'AG4168'), (188, 'AG4167'), (218, 'AG3727'), (474, 'AG3726'), (220, 'AG4165')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23229 of IO tensor {'CrossPassTensor': ''}bfloat16 %input61|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(78, 'AG3741'), (273, 'AG3740'), (79, 'AG3736'), (80, 'AG3735'), (81, 'AG3734'), (358, 'AG3739'), (470, 'AG3738')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23228 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23228 of IO tensor {'CrossPassTensor': ''}bfloat16 %input62|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (471, 'AG3737')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23219 of IO tensor {'CrossPassTensor': ''}bfloat16 %input63|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23224 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23224 of IO tensor {'CrossPassTensor': ''}bfloat16 %input65|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (79, 'AG3736'), (80, 'AG3735'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23220 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23220 of IO tensor {'CrossPassTensor': ''}bfloat16 %input67|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(79, 'AG3736'), (191, 'AG3731'), (80, 'AG3735'), (81, 'AG3734'), (274, 'AG3733'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23233 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(469, 'AG3742'), (74, 'AG3744'), (357, 'AG3743')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23233 of IO tensor {'CrossPassTensor': ''}bfloat16 %input68|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(37, 'AG3748'), (1, 'AG3745'), (356, 'AG3747'), (468, 'AG3746')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23232 of IO tensor {'CrossPassTensor': ''}bfloat16 %input69|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23230 of IO tensor {'CrossPassTensor': ''}bfloat16 %input70|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23231 of IO tensor {'CrossPassTensor': ''}bfloat16 %input71|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23244 of IO tensor {'CrossPassTensor': ''}bfloat16 %input72|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(82, 'AG3757'), (271, 'AG3756'), (83, 'AG3752'), (84, 'AG3751'), (85, 'AG3750'), (355, 'AG3755'), (466, 'AG3754')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23243 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23243 of IO tensor {'CrossPassTensor': ''}bfloat16 %input73|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (467, 'AG3753')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23234 of IO tensor {'CrossPassTensor': ''}bfloat16 %input74|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23239 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23239 of IO tensor {'CrossPassTensor': ''}bfloat16 %input76|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (83, 'AG3752'), (84, 'AG3751'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23235 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23235 of IO tensor {'CrossPassTensor': ''}bfloat16 %input78|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(83, 'AG3752'), (191, 'AG3731'), (84, 'AG3751'), (85, 'AG3750'), (272, 'AG3749'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23248 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(465, 'AG3758'), (75, 'AG3760'), (354, 'AG3759')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23248 of IO tensor {'CrossPassTensor': ''}bfloat16 %input79|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(38, 'AG3764'), (2, 'AG3761'), (353, 'AG3763'), (464, 'AG3762')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23247 of IO tensor {'CrossPassTensor': ''}bfloat16 %input80|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23245 of IO tensor {'CrossPassTensor': ''}bfloat16 %input81|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23246 of IO tensor {'CrossPassTensor': ''}bfloat16 %input82|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23259 of IO tensor {'CrossPassTensor': ''}bfloat16 %input83|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(86, 'AG3773'), (269, 'AG3772'), (87, 'AG3768'), (88, 'AG3767'), (89, 'AG3766'), (352, 'AG3771'), (462, 'AG3770')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23258 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23258 of IO tensor {'CrossPassTensor': ''}bfloat16 %input84|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (463, 'AG3769')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23249 of IO tensor {'CrossPassTensor': ''}bfloat16 %input85|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23254 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23254 of IO tensor {'CrossPassTensor': ''}bfloat16 %input87|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (87, 'AG3768'), (88, 'AG3767'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23250 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23250 of IO tensor {'CrossPassTensor': ''}bfloat16 %input89|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(87, 'AG3768'), (191, 'AG3731'), (88, 'AG3767'), (89, 'AG3766'), (270, 'AG3765'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23263 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(461, 'AG3774'), (76, 'AG3776'), (351, 'AG3775')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23263 of IO tensor {'CrossPassTensor': ''}bfloat16 %input90|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(39, 'AG3780'), (3, 'AG3777'), (350, 'AG3779'), (460, 'AG3778')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23262 of IO tensor {'CrossPassTensor': ''}bfloat16 %input91|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23260 of IO tensor {'CrossPassTensor': ''}bfloat16 %input92|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23261 of IO tensor {'CrossPassTensor': ''}bfloat16 %input93|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23274 of IO tensor {'CrossPassTensor': ''}bfloat16 %input94|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(90, 'AG3789'), (267, 'AG3788'), (91, 'AG3784'), (92, 'AG3783'), (93, 'AG3782'), (349, 'AG3787'), (458, 'AG3786')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23273 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23273 of IO tensor {'CrossPassTensor': ''}bfloat16 %input95|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (459, 'AG3785')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23264 of IO tensor {'CrossPassTensor': ''}bfloat16 %input96|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23269 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23269 of IO tensor {'CrossPassTensor': ''}bfloat16 %input98|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (91, 'AG3784'), (92, 'AG3783'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23265 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23265 of IO tensor {'CrossPassTensor': ''}bfloat16 %input100|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(91, 'AG3784'), (191, 'AG3731'), (92, 'AG3783'), (93, 'AG3782'), (268, 'AG3781'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23278 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(457, 'AG3790'), (192, 'AG3792'), (348, 'AG3791')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23278 of IO tensor {'CrossPassTensor': ''}bfloat16 %input101|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(40, 'AG3796'), (4, 'AG3793'), (347, 'AG3795'), (456, 'AG3794')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23277 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23277 of IO tensor {'CrossPassTensor': ''}bfloat16 %input102|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23275 of IO tensor {'CrossPassTensor': ''}bfloat16 %input103|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23276 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23276 of IO tensor {'CrossPassTensor': ''}bfloat16 %input104|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(4, 'AG3793'), (192, 'AG3792'), (191, 'AG3731'), (348, 'AG3791'), (457, 'AG3790')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23289 of IO tensor {'CrossPassTensor': ''}bfloat16 %input105|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(94, 'AG3805'), (265, 'AG3804'), (95, 'AG3800'), (96, 'AG3799'), (97, 'AG3798'), (346, 'AG3803'), (454, 'AG3802')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23288 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23288 of IO tensor {'CrossPassTensor': ''}bfloat16 %input106|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (455, 'AG3801')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23279 of IO tensor {'CrossPassTensor': ''}bfloat16 %input107|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23284 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23284 of IO tensor {'CrossPassTensor': ''}bfloat16 %input109|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (95, 'AG3800'), (96, 'AG3799'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23280 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23280 of IO tensor {'CrossPassTensor': ''}bfloat16 %input111|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(95, 'AG3800'), (191, 'AG3731'), (96, 'AG3799'), (97, 'AG3798'), (266, 'AG3797'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23293 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(453, 'AG3806'), (193, 'AG3808'), (345, 'AG3807')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23293 of IO tensor {'CrossPassTensor': ''}bfloat16 %input112|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(41, 'AG3812'), (5, 'AG3809'), (344, 'AG3811'), (452, 'AG3810')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23292 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23292 of IO tensor {'CrossPassTensor': ''}bfloat16 %input113|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23290 of IO tensor {'CrossPassTensor': ''}bfloat16 %input114|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23291 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23291 of IO tensor {'CrossPassTensor': ''}bfloat16 %input115|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(5, 'AG3809'), (193, 'AG3808'), (191, 'AG3731'), (345, 'AG3807'), (453, 'AG3806')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23304 of IO tensor {'CrossPassTensor': ''}bfloat16 %input116|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(98, 'AG3821'), (263, 'AG3820'), (99, 'AG3816'), (100, 'AG3815'), (101, 'AG3814'), (343, 'AG3819'), (450, 'AG3818')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23303 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23303 of IO tensor {'CrossPassTensor': ''}bfloat16 %input117|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (451, 'AG3817')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23294 of IO tensor {'CrossPassTensor': ''}bfloat16 %input118|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23299 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23299 of IO tensor {'CrossPassTensor': ''}bfloat16 %input120|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (99, 'AG3816'), (100, 'AG3815'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23295 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23295 of IO tensor {'CrossPassTensor': ''}bfloat16 %input122|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(99, 'AG3816'), (191, 'AG3731'), (100, 'AG3815'), (101, 'AG3814'), (264, 'AG3813'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23308 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(449, 'AG3822'), (194, 'AG3824'), (342, 'AG3823')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23308 of IO tensor {'CrossPassTensor': ''}bfloat16 %input123|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(42, 'AG3828'), (6, 'AG3825'), (341, 'AG3827'), (448, 'AG3826')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23307 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23307 of IO tensor {'CrossPassTensor': ''}bfloat16 %input124|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23305 of IO tensor {'CrossPassTensor': ''}bfloat16 %input125|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23306 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23306 of IO tensor {'CrossPassTensor': ''}bfloat16 %input126|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(6, 'AG3825'), (194, 'AG3824'), (191, 'AG3731'), (342, 'AG3823'), (449, 'AG3822')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23319 of IO tensor {'CrossPassTensor': ''}bfloat16 %input127|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(102, 'AG3837'), (261, 'AG3836'), (103, 'AG3832'), (104, 'AG3831'), (105, 'AG3830'), (340, 'AG3835'), (446, 'AG3834')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23318 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23318 of IO tensor {'CrossPassTensor': ''}bfloat16 %input128|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (447, 'AG3833')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23309 of IO tensor {'CrossPassTensor': ''}bfloat16 %input129|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23314 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23314 of IO tensor {'CrossPassTensor': ''}bfloat16 %input131|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (103, 'AG3832'), (104, 'AG3831'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23310 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23310 of IO tensor {'CrossPassTensor': ''}bfloat16 %input133|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(103, 'AG3832'), (191, 'AG3731'), (104, 'AG3831'), (105, 'AG3830'), (262, 'AG3829'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23323 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(445, 'AG3838'), (195, 'AG3840'), (339, 'AG3839')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23323 of IO tensor {'CrossPassTensor': ''}bfloat16 %input134|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(43, 'AG3844'), (7, 'AG3841'), (338, 'AG3843'), (444, 'AG3842')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23322 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23322 of IO tensor {'CrossPassTensor': ''}bfloat16 %input135|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23320 of IO tensor {'CrossPassTensor': ''}bfloat16 %input136|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23321 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23321 of IO tensor {'CrossPassTensor': ''}bfloat16 %input137|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(7, 'AG3841'), (195, 'AG3840'), (191, 'AG3731'), (339, 'AG3839'), (445, 'AG3838')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23334 of IO tensor {'CrossPassTensor': ''}bfloat16 %input138|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(106, 'AG3853'), (259, 'AG3852'), (107, 'AG3848'), (108, 'AG3847'), (109, 'AG3846'), (337, 'AG3851'), (442, 'AG3850')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23333 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23333 of IO tensor {'CrossPassTensor': ''}bfloat16 %input139|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (443, 'AG3849')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23324 of IO tensor {'CrossPassTensor': ''}bfloat16 %input140|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23329 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23329 of IO tensor {'CrossPassTensor': ''}bfloat16 %input142|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (107, 'AG3848'), (108, 'AG3847'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23325 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23325 of IO tensor {'CrossPassTensor': ''}bfloat16 %input144|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(107, 'AG3848'), (191, 'AG3731'), (108, 'AG3847'), (109, 'AG3846'), (260, 'AG3845'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23338 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(441, 'AG3854'), (196, 'AG3856'), (336, 'AG3855')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23338 of IO tensor {'CrossPassTensor': ''}bfloat16 %input145|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(44, 'AG3860'), (8, 'AG3857'), (335, 'AG3859'), (440, 'AG3858')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23337 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23337 of IO tensor {'CrossPassTensor': ''}bfloat16 %input146|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23335 of IO tensor {'CrossPassTensor': ''}bfloat16 %input147|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23336 of IO tensor {'CrossPassTensor': ''}bfloat16 %input148|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(8, 'AG3857'), (196, 'AG3856'), (191, 'AG3731'), (336, 'AG3855'), (441, 'AG3854')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23349 of IO tensor {'CrossPassTensor': ''}bfloat16 %input149|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(110, 'AG3869'), (257, 'AG3868'), (111, 'AG3864'), (112, 'AG3863'), (113, 'AG3862'), (334, 'AG3867'), (438, 'AG3866')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23348 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23348 of IO tensor {'CrossPassTensor': ''}bfloat16 %input150|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (439, 'AG3865')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23339 of IO tensor {'CrossPassTensor': ''}bfloat16 %input151|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23344 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23344 of IO tensor {'CrossPassTensor': ''}bfloat16 %input153|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (111, 'AG3864'), (112, 'AG3863'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23340 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23340 of IO tensor {'CrossPassTensor': ''}bfloat16 %input155|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(111, 'AG3864'), (191, 'AG3731'), (112, 'AG3863'), (113, 'AG3862'), (258, 'AG3861'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23353 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(437, 'AG3870'), (197, 'AG3872'), (333, 'AG3871')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23353 of IO tensor {'CrossPassTensor': ''}bfloat16 %input156|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(45, 'AG3876'), (9, 'AG3873'), (332, 'AG3875'), (436, 'AG3874')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23352 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23352 of IO tensor {'CrossPassTensor': ''}bfloat16 %input157|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23350 of IO tensor {'CrossPassTensor': ''}bfloat16 %input158|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23351 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23351 of IO tensor {'CrossPassTensor': ''}bfloat16 %input159|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(9, 'AG3873'), (197, 'AG3872'), (191, 'AG3731'), (333, 'AG3871'), (437, 'AG3870')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23364 of IO tensor {'CrossPassTensor': ''}bfloat16 %input160|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(114, 'AG3885'), (255, 'AG3884'), (115, 'AG3880'), (116, 'AG3879'), (117, 'AG3878'), (331, 'AG3883'), (434, 'AG3882')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23363 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23363 of IO tensor {'CrossPassTensor': ''}bfloat16 %input161|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (435, 'AG3881')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23354 of IO tensor {'CrossPassTensor': ''}bfloat16 %input162|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23359 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23359 of IO tensor {'CrossPassTensor': ''}bfloat16 %input164|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (115, 'AG3880'), (116, 'AG3879'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23355 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23355 of IO tensor {'CrossPassTensor': ''}bfloat16 %input166|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(115, 'AG3880'), (191, 'AG3731'), (116, 'AG3879'), (117, 'AG3878'), (256, 'AG3877'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23368 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(433, 'AG3886'), (198, 'AG3888'), (330, 'AG3887')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23368 of IO tensor {'CrossPassTensor': ''}bfloat16 %input167|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(46, 'AG3892'), (10, 'AG3889'), (329, 'AG3891'), (432, 'AG3890')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23367 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23367 of IO tensor {'CrossPassTensor': ''}bfloat16 %input168|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23365 of IO tensor {'CrossPassTensor': ''}bfloat16 %input169|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23366 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23366 of IO tensor {'CrossPassTensor': ''}bfloat16 %input170|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(10, 'AG3889'), (198, 'AG3888'), (191, 'AG3731'), (330, 'AG3887'), (433, 'AG3886')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23379 of IO tensor {'CrossPassTensor': ''}bfloat16 %input171|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(118, 'AG3901'), (253, 'AG3900'), (119, 'AG3896'), (120, 'AG3895'), (121, 'AG3894'), (328, 'AG3899'), (430, 'AG3898')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23378 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23378 of IO tensor {'CrossPassTensor': ''}bfloat16 %input172|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (431, 'AG3897')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23369 of IO tensor {'CrossPassTensor': ''}bfloat16 %input173|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23374 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23374 of IO tensor {'CrossPassTensor': ''}bfloat16 %input175|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (119, 'AG3896'), (120, 'AG3895'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23370 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23370 of IO tensor {'CrossPassTensor': ''}bfloat16 %input177|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(119, 'AG3896'), (191, 'AG3731'), (120, 'AG3895'), (121, 'AG3894'), (254, 'AG3893'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23383 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(429, 'AG3902'), (199, 'AG3904'), (327, 'AG3903')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23383 of IO tensor {'CrossPassTensor': ''}bfloat16 %input178|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(47, 'AG3908'), (11, 'AG3905'), (326, 'AG3907'), (428, 'AG3906')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23382 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23382 of IO tensor {'CrossPassTensor': ''}bfloat16 %input179|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23380 of IO tensor {'CrossPassTensor': ''}bfloat16 %input180|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23381 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23381 of IO tensor {'CrossPassTensor': ''}bfloat16 %input181|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(11, 'AG3905'), (199, 'AG3904'), (191, 'AG3731'), (327, 'AG3903'), (429, 'AG3902')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23394 of IO tensor {'CrossPassTensor': ''}bfloat16 %input182|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(122, 'AG3917'), (251, 'AG3916'), (123, 'AG3912'), (124, 'AG3911'), (125, 'AG3910'), (325, 'AG3915'), (426, 'AG3914')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23393 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23393 of IO tensor {'CrossPassTensor': ''}bfloat16 %input183|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (427, 'AG3913')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23384 of IO tensor {'CrossPassTensor': ''}bfloat16 %input184|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23389 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23389 of IO tensor {'CrossPassTensor': ''}bfloat16 %input186|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (123, 'AG3912'), (124, 'AG3911'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23385 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23385 of IO tensor {'CrossPassTensor': ''}bfloat16 %input188|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(123, 'AG3912'), (191, 'AG3731'), (124, 'AG3911'), (125, 'AG3910'), (252, 'AG3909'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(425, 'AG3918'), (200, 'AG3920'), (324, 'AG3919')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23398 of IO tensor {'CrossPassTensor': ''}bfloat16 %input189|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(48, 'AG3924'), (12, 'AG3921'), (323, 'AG3923'), (424, 'AG3922')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23397 of IO tensor {'CrossPassTensor': ''}bfloat16 %input190|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23395 of IO tensor {'CrossPassTensor': ''}bfloat16 %input191|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23396 of IO tensor {'CrossPassTensor': ''}bfloat16 %input192|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(12, 'AG3921'), (200, 'AG3920'), (191, 'AG3731'), (324, 'AG3919'), (425, 'AG3918')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23409 of IO tensor {'CrossPassTensor': ''}bfloat16 %input193|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(126, 'AG3933'), (249, 'AG3932'), (127, 'AG3928'), (128, 'AG3927'), (129, 'AG3926'), (322, 'AG3931'), (422, 'AG3930')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23408 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23408 of IO tensor {'CrossPassTensor': ''}bfloat16 %input194|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (423, 'AG3929')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23399 of IO tensor {'CrossPassTensor': ''}bfloat16 %input195|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23404 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23404 of IO tensor {'CrossPassTensor': ''}bfloat16 %input197|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (127, 'AG3928'), (128, 'AG3927'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23400 of IO tensor {'CrossPassTensor': ''}bfloat16 %input199|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(127, 'AG3928'), (191, 'AG3731'), (128, 'AG3927'), (129, 'AG3926'), (250, 'AG3925'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(421, 'AG3934'), (201, 'AG3936'), (321, 'AG3935')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23413 of IO tensor {'CrossPassTensor': ''}bfloat16 %input200|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(49, 'AG3940'), (13, 'AG3937'), (320, 'AG3939'), (420, 'AG3938')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23412 of IO tensor {'CrossPassTensor': ''}bfloat16 %input201|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23410 of IO tensor {'CrossPassTensor': ''}bfloat16 %input202|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23411 of IO tensor {'CrossPassTensor': ''}bfloat16 %input203|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(13, 'AG3937'), (201, 'AG3936'), (191, 'AG3731'), (321, 'AG3935'), (421, 'AG3934')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23424 of IO tensor {'CrossPassTensor': ''}bfloat16 %input204|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(130, 'AG3949'), (247, 'AG3948'), (131, 'AG3944'), (132, 'AG3943'), (133, 'AG3942'), (319, 'AG3947'), (418, 'AG3946')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23423 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23423 of IO tensor {'CrossPassTensor': ''}bfloat16 %input205|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (419, 'AG3945')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23414 of IO tensor {'CrossPassTensor': ''}bfloat16 %input206|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23419 of IO tensor {'CrossPassTensor': ''}bfloat16 %input208|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (131, 'AG3944'), (132, 'AG3943'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23415 of IO tensor {'CrossPassTensor': ''}bfloat16 %input210|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(131, 'AG3944'), (191, 'AG3731'), (132, 'AG3943'), (133, 'AG3942'), (248, 'AG3941'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(417, 'AG3950'), (202, 'AG3952'), (318, 'AG3951')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23428 of IO tensor {'CrossPassTensor': ''}bfloat16 %input211|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(50, 'AG3956'), (14, 'AG3953'), (317, 'AG3955'), (416, 'AG3954')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23427 of IO tensor {'CrossPassTensor': ''}bfloat16 %input212|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23425 of IO tensor {'CrossPassTensor': ''}bfloat16 %input213|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23426 of IO tensor {'CrossPassTensor': ''}bfloat16 %input214|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(14, 'AG3953'), (202, 'AG3952'), (191, 'AG3731'), (318, 'AG3951'), (417, 'AG3950')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23439 of IO tensor {'CrossPassTensor': ''}bfloat16 %input215|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(134, 'AG3965'), (245, 'AG3964'), (135, 'AG3960'), (136, 'AG3959'), (137, 'AG3958'), (316, 'AG3963'), (414, 'AG3962')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23438 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23438 of IO tensor {'CrossPassTensor': ''}bfloat16 %input216|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (415, 'AG3961')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23429 of IO tensor {'CrossPassTensor': ''}bfloat16 %input217|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23434 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23434 of IO tensor {'CrossPassTensor': ''}bfloat16 %input219|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (135, 'AG3960'), (136, 'AG3959'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23430 of IO tensor {'CrossPassTensor': ''}bfloat16 %input221|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(135, 'AG3960'), (191, 'AG3731'), (136, 'AG3959'), (137, 'AG3958'), (246, 'AG3957'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(413, 'AG3966'), (203, 'AG3968'), (315, 'AG3967')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23443 of IO tensor {'CrossPassTensor': ''}bfloat16 %input222|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(51, 'AG3972'), (15, 'AG3969'), (314, 'AG3971'), (412, 'AG3970')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23442 of IO tensor {'CrossPassTensor': ''}bfloat16 %input223|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23440 of IO tensor {'CrossPassTensor': ''}bfloat16 %input224|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23441 of IO tensor {'CrossPassTensor': ''}bfloat16 %input225|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(15, 'AG3969'), (203, 'AG3968'), (191, 'AG3731'), (315, 'AG3967'), (413, 'AG3966')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23454 of IO tensor {'CrossPassTensor': ''}bfloat16 %input226|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(138, 'AG3981'), (243, 'AG3980'), (139, 'AG3976'), (140, 'AG3975'), (141, 'AG3974'), (313, 'AG3979'), (410, 'AG3978')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23453 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23453 of IO tensor {'CrossPassTensor': ''}bfloat16 %input227|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (411, 'AG3977')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23444 of IO tensor {'CrossPassTensor': ''}bfloat16 %input228|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23449 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23449 of IO tensor {'CrossPassTensor': ''}bfloat16 %input230|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (139, 'AG3976'), (140, 'AG3975'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23445 of IO tensor {'CrossPassTensor': ''}bfloat16 %input232|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(139, 'AG3976'), (191, 'AG3731'), (140, 'AG3975'), (141, 'AG3974'), (244, 'AG3973'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(409, 'AG3982'), (204, 'AG3984'), (312, 'AG3983')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23458 of IO tensor {'CrossPassTensor': ''}bfloat16 %input233|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(52, 'AG3988'), (16, 'AG3985'), (311, 'AG3987'), (408, 'AG3986')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23457 of IO tensor {'CrossPassTensor': ''}bfloat16 %input234|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23455 of IO tensor {'CrossPassTensor': ''}bfloat16 %input235|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23456 of IO tensor {'CrossPassTensor': ''}bfloat16 %input236|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(16, 'AG3985'), (204, 'AG3984'), (191, 'AG3731'), (312, 'AG3983'), (409, 'AG3982')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23469 of IO tensor {'CrossPassTensor': ''}bfloat16 %input237|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(142, 'AG3997'), (241, 'AG3996'), (143, 'AG3992'), (144, 'AG3991'), (145, 'AG3990'), (310, 'AG3995'), (406, 'AG3994')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23468 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23468 of IO tensor {'CrossPassTensor': ''}bfloat16 %input238|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (407, 'AG3993')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23459 of IO tensor {'CrossPassTensor': ''}bfloat16 %input239|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23464 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23464 of IO tensor {'CrossPassTensor': ''}bfloat16 %input241|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (143, 'AG3992'), (144, 'AG3991'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23460 of IO tensor {'CrossPassTensor': ''}bfloat16 %input243|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(143, 'AG3992'), (191, 'AG3731'), (144, 'AG3991'), (145, 'AG3990'), (242, 'AG3989'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(405, 'AG3998'), (205, 'AG4000'), (309, 'AG3999')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23473 of IO tensor {'CrossPassTensor': ''}bfloat16 %input244|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(53, 'AG4004'), (17, 'AG4001'), (308, 'AG4003'), (404, 'AG4002')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23472 of IO tensor {'CrossPassTensor': ''}bfloat16 %input245|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23470 of IO tensor {'CrossPassTensor': ''}bfloat16 %input246|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23471 of IO tensor {'CrossPassTensor': ''}bfloat16 %input247|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(17, 'AG4001'), (205, 'AG4000'), (191, 'AG3731'), (309, 'AG3999'), (405, 'AG3998')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23484 of IO tensor {'CrossPassTensor': ''}bfloat16 %input248|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(146, 'AG4013'), (239, 'AG4012'), (147, 'AG4008'), (148, 'AG4007'), (149, 'AG4006'), (307, 'AG4011'), (402, 'AG4010')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23483 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23483 of IO tensor {'CrossPassTensor': ''}bfloat16 %input249|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (403, 'AG4009')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23474 of IO tensor {'CrossPassTensor': ''}bfloat16 %input250|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23479 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23479 of IO tensor {'CrossPassTensor': ''}bfloat16 %input252|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (147, 'AG4008'), (148, 'AG4007'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23475 of IO tensor {'CrossPassTensor': ''}bfloat16 %input254|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(147, 'AG4008'), (191, 'AG3731'), (148, 'AG4007'), (149, 'AG4006'), (240, 'AG4005'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(401, 'AG4014'), (206, 'AG4016'), (306, 'AG4015')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23488 of IO tensor {'CrossPassTensor': ''}bfloat16 %input255|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(54, 'AG4020'), (18, 'AG4017'), (305, 'AG4019'), (400, 'AG4018')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23487 of IO tensor {'CrossPassTensor': ''}bfloat16 %input256|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23485 of IO tensor {'CrossPassTensor': ''}bfloat16 %input257|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23486 of IO tensor {'CrossPassTensor': ''}bfloat16 %input258|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(18, 'AG4017'), (206, 'AG4016'), (191, 'AG3731'), (306, 'AG4015'), (401, 'AG4014')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23499 of IO tensor {'CrossPassTensor': ''}bfloat16 %input259|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(150, 'AG4029'), (237, 'AG4028'), (151, 'AG4024'), (152, 'AG4023'), (153, 'AG4022'), (304, 'AG4027'), (398, 'AG4026')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23498 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23498 of IO tensor {'CrossPassTensor': ''}bfloat16 %input260|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (399, 'AG4025')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23489 of IO tensor {'CrossPassTensor': ''}bfloat16 %input261|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23494 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23494 of IO tensor {'CrossPassTensor': ''}bfloat16 %input263|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (151, 'AG4024'), (152, 'AG4023'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23490 of IO tensor {'CrossPassTensor': ''}bfloat16 %input265|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(151, 'AG4024'), (191, 'AG3731'), (152, 'AG4023'), (153, 'AG4022'), (238, 'AG4021'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(397, 'AG4030'), (207, 'AG4032'), (303, 'AG4031')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23503 of IO tensor {'CrossPassTensor': ''}bfloat16 %input266|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(55, 'AG4036'), (19, 'AG4033'), (302, 'AG4035'), (396, 'AG4034')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23502 of IO tensor {'CrossPassTensor': ''}bfloat16 %input267|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23500 of IO tensor {'CrossPassTensor': ''}bfloat16 %input268|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23501 of IO tensor {'CrossPassTensor': ''}bfloat16 %input269|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(19, 'AG4033'), (207, 'AG4032'), (191, 'AG3731'), (303, 'AG4031'), (397, 'AG4030')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23514 of IO tensor {'CrossPassTensor': ''}bfloat16 %input270|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(154, 'AG4045'), (235, 'AG4044'), (155, 'AG4040'), (156, 'AG4039'), (157, 'AG4038'), (301, 'AG4043'), (394, 'AG4042')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23513 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23513 of IO tensor {'CrossPassTensor': ''}bfloat16 %input271|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (395, 'AG4041')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23504 of IO tensor {'CrossPassTensor': ''}bfloat16 %input272|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23509 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23509 of IO tensor {'CrossPassTensor': ''}bfloat16 %input274|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (155, 'AG4040'), (156, 'AG4039'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23505 of IO tensor {'CrossPassTensor': ''}bfloat16 %input276|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(155, 'AG4040'), (191, 'AG3731'), (156, 'AG4039'), (157, 'AG4038'), (236, 'AG4037'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(393, 'AG4046'), (208, 'AG4048'), (300, 'AG4047')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23518 of IO tensor {'CrossPassTensor': ''}bfloat16 %input277|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(56, 'AG4052'), (20, 'AG4049'), (299, 'AG4051'), (392, 'AG4050')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23517 of IO tensor {'CrossPassTensor': ''}bfloat16 %input278|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23515 of IO tensor {'CrossPassTensor': ''}bfloat16 %input279|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23516 of IO tensor {'CrossPassTensor': ''}bfloat16 %input280|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(20, 'AG4049'), (208, 'AG4048'), (191, 'AG3731'), (300, 'AG4047'), (393, 'AG4046')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23529 of IO tensor {'CrossPassTensor': ''}bfloat16 %input281|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(158, 'AG4061'), (233, 'AG4060'), (159, 'AG4056'), (160, 'AG4055'), (161, 'AG4054'), (298, 'AG4059'), (390, 'AG4058')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23528 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23528 of IO tensor {'CrossPassTensor': ''}bfloat16 %input282|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (391, 'AG4057')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23519 of IO tensor {'CrossPassTensor': ''}bfloat16 %input283|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23524 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23524 of IO tensor {'CrossPassTensor': ''}bfloat16 %input285|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (159, 'AG4056'), (160, 'AG4055'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23520 of IO tensor {'CrossPassTensor': ''}bfloat16 %input287|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(159, 'AG4056'), (191, 'AG3731'), (160, 'AG4055'), (161, 'AG4054'), (234, 'AG4053'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(389, 'AG4062'), (209, 'AG4064'), (297, 'AG4063')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23533 of IO tensor {'CrossPassTensor': ''}bfloat16 %input288|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(57, 'AG4068'), (21, 'AG4065'), (296, 'AG4067'), (388, 'AG4066')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23532 of IO tensor {'CrossPassTensor': ''}bfloat16 %input289|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23530 of IO tensor {'CrossPassTensor': ''}bfloat16 %input290|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23531 of IO tensor {'CrossPassTensor': ''}bfloat16 %input291|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(21, 'AG4065'), (209, 'AG4064'), (191, 'AG3731'), (297, 'AG4063'), (389, 'AG4062')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23544 of IO tensor {'CrossPassTensor': ''}bfloat16 %input292|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(162, 'AG4077'), (231, 'AG4076'), (163, 'AG4072'), (164, 'AG4071'), (165, 'AG4070'), (295, 'AG4075'), (386, 'AG4074')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23543 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23543 of IO tensor {'CrossPassTensor': ''}bfloat16 %input293|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (387, 'AG4073')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23534 of IO tensor {'CrossPassTensor': ''}bfloat16 %input294|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23539 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23539 of IO tensor {'CrossPassTensor': ''}bfloat16 %input296|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (163, 'AG4072'), (164, 'AG4071'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23535 of IO tensor {'CrossPassTensor': ''}bfloat16 %input298|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(163, 'AG4072'), (191, 'AG3731'), (164, 'AG4071'), (165, 'AG4070'), (232, 'AG4069'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(385, 'AG4078'), (210, 'AG4080'), (294, 'AG4079')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23548 of IO tensor {'CrossPassTensor': ''}bfloat16 %input299|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(58, 'AG4084'), (22, 'AG4081'), (293, 'AG4083'), (384, 'AG4082')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23547 of IO tensor {'CrossPassTensor': ''}bfloat16 %input300|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23545 of IO tensor {'CrossPassTensor': ''}bfloat16 %input301|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23546 of IO tensor {'CrossPassTensor': ''}bfloat16 %input302|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(22, 'AG4081'), (210, 'AG4080'), (191, 'AG3731'), (294, 'AG4079'), (385, 'AG4078')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23559 of IO tensor {'CrossPassTensor': ''}bfloat16 %input303|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(166, 'AG4093'), (229, 'AG4092'), (167, 'AG4088'), (168, 'AG4087'), (169, 'AG4086'), (292, 'AG4091'), (382, 'AG4090')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23558 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23558 of IO tensor {'CrossPassTensor': ''}bfloat16 %input304|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (383, 'AG4089')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23549 of IO tensor {'CrossPassTensor': ''}bfloat16 %input305|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23554 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23554 of IO tensor {'CrossPassTensor': ''}bfloat16 %input307|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (167, 'AG4088'), (168, 'AG4087'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23550 of IO tensor {'CrossPassTensor': ''}bfloat16 %input309|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(167, 'AG4088'), (191, 'AG3731'), (168, 'AG4087'), (169, 'AG4086'), (230, 'AG4085'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(381, 'AG4094'), (211, 'AG4096'), (291, 'AG4095')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23563 of IO tensor {'CrossPassTensor': ''}bfloat16 %input310|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(59, 'AG4100'), (23, 'AG4097'), (290, 'AG4099'), (380, 'AG4098')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23562 of IO tensor {'CrossPassTensor': ''}bfloat16 %input311|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23560 of IO tensor {'CrossPassTensor': ''}bfloat16 %input312|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23561 of IO tensor {'CrossPassTensor': ''}bfloat16 %input313|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(23, 'AG4097'), (211, 'AG4096'), (191, 'AG3731'), (291, 'AG4095'), (381, 'AG4094')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23574 of IO tensor {'CrossPassTensor': ''}bfloat16 %input314|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(170, 'AG4109'), (227, 'AG4108'), (171, 'AG4104'), (172, 'AG4103'), (173, 'AG4102'), (289, 'AG4107'), (378, 'AG4106')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23573 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23573 of IO tensor {'CrossPassTensor': ''}bfloat16 %input315|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (379, 'AG4105')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23564 of IO tensor {'CrossPassTensor': ''}bfloat16 %input316|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23569 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23569 of IO tensor {'CrossPassTensor': ''}bfloat16 %input318|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (171, 'AG4104'), (172, 'AG4103'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23565 of IO tensor {'CrossPassTensor': ''}bfloat16 %input320|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(171, 'AG4104'), (191, 'AG3731'), (172, 'AG4103'), (173, 'AG4102'), (228, 'AG4101'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(377, 'AG4110'), (212, 'AG4112'), (288, 'AG4111')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23578 of IO tensor {'CrossPassTensor': ''}bfloat16 %input321|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(60, 'AG4116'), (24, 'AG4113'), (287, 'AG4115'), (376, 'AG4114')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23577 of IO tensor {'CrossPassTensor': ''}bfloat16 %input322|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23575 of IO tensor {'CrossPassTensor': ''}bfloat16 %input323|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23576 of IO tensor {'CrossPassTensor': ''}bfloat16 %input324|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(24, 'AG4113'), (212, 'AG4112'), (191, 'AG3731'), (288, 'AG4111'), (377, 'AG4110')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23589 of IO tensor {'CrossPassTensor': ''}bfloat16 %input325|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(174, 'AG4125'), (225, 'AG4124'), (175, 'AG4120'), (176, 'AG4119'), (177, 'AG4118'), (286, 'AG4123'), (374, 'AG4122')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23588 of IO tensor {'CrossPassTensor': ''}bfloat16 %input326|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (375, 'AG4121')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23579 of IO tensor {'CrossPassTensor': ''}bfloat16 %input327|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23584 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23584 of IO tensor {'CrossPassTensor': ''}bfloat16 %input329|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (175, 'AG4120'), (176, 'AG4119'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23580 of IO tensor {'CrossPassTensor': ''}bfloat16 %input331|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(175, 'AG4120'), (191, 'AG3731'), (176, 'AG4119'), (177, 'AG4118'), (226, 'AG4117'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(373, 'AG4126'), (213, 'AG4128'), (285, 'AG4127')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23593 of IO tensor {'CrossPassTensor': ''}bfloat16 %input332|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(61, 'AG4132'), (25, 'AG4129'), (284, 'AG4131'), (372, 'AG4130')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23592 of IO tensor {'CrossPassTensor': ''}bfloat16 %input333|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23590 of IO tensor {'CrossPassTensor': ''}bfloat16 %input334|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23591 of IO tensor {'CrossPassTensor': ''}bfloat16 %input335|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(25, 'AG4129'), (213, 'AG4128'), (191, 'AG3731'), (285, 'AG4127'), (373, 'AG4126')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23604 of IO tensor {'CrossPassTensor': ''}bfloat16 %input336|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(178, 'AG4141'), (223, 'AG4140'), (179, 'AG4136'), (180, 'AG4135'), (181, 'AG4134'), (283, 'AG4139'), (370, 'AG4138')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23603 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23603 of IO tensor {'CrossPassTensor': ''}bfloat16 %input337|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (371, 'AG4137')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23594 of IO tensor {'CrossPassTensor': ''}bfloat16 %input338|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23599 of IO tensor {'CrossPassTensor': ''}bfloat16 %input340|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (179, 'AG4136'), (180, 'AG4135'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23595 of IO tensor {'CrossPassTensor': ''}bfloat16 %input342|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(179, 'AG4136'), (191, 'AG3731'), (180, 'AG4135'), (181, 'AG4134'), (224, 'AG4133'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(369, 'AG4142'), (214, 'AG4144'), (282, 'AG4143')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23608 of IO tensor {'CrossPassTensor': ''}bfloat16 %input343|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(62, 'AG4148'), (26, 'AG4145'), (281, 'AG4147'), (368, 'AG4146')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23607 of IO tensor {'CrossPassTensor': ''}bfloat16 %input344|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23605 of IO tensor {'CrossPassTensor': ''}bfloat16 %input345|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23606 of IO tensor {'CrossPassTensor': ''}bfloat16 %input346|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(26, 'AG4145'), (214, 'AG4144'), (191, 'AG3731'), (282, 'AG4143'), (369, 'AG4142')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23619 of IO tensor {'CrossPassTensor': ''}bfloat16 %input347|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(182, 'AG4157'), (221, 'AG4156'), (183, 'AG4152'), (184, 'AG4151'), (185, 'AG4150'), (280, 'AG4155'), (366, 'AG4154')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23618 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23618 of IO tensor {'CrossPassTensor': ''}bfloat16 %input348|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (367, 'AG4153')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23609 of IO tensor {'CrossPassTensor': ''}bfloat16 %input349|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23614 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23614 of IO tensor {'CrossPassTensor': ''}bfloat16 %input351|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (183, 'AG4152'), (184, 'AG4151'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23610 of IO tensor {'CrossPassTensor': ''}bfloat16 %input353|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(183, 'AG4152'), (191, 'AG3731'), (184, 'AG4151'), (185, 'AG4150'), (222, 'AG4149'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(365, 'AG4158'), (215, 'AG4160'), (279, 'AG4159')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23623 of IO tensor {'CrossPassTensor': ''}bfloat16 %input354|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(63, 'AG4164'), (27, 'AG4161'), (278, 'AG4163'), (364, 'AG4162')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23622 of IO tensor {'CrossPassTensor': ''}bfloat16 %input355|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23620 of IO tensor {'CrossPassTensor': ''}bfloat16 %input356|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23621 of IO tensor {'CrossPassTensor': ''}bfloat16 %input357|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(27, 'AG4161'), (215, 'AG4160'), (191, 'AG3731'), (279, 'AG4159'), (365, 'AG4158')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23634 of IO tensor {'CrossPassTensor': ''}bfloat16 %input358|NC|(2, 2, 128, 2, 2, 2, 4, 128) is not sorted, index list (w/ AG ids): [(186, 'AG4173'), (219, 'AG4172'), (187, 'AG4168'), (188, 'AG4167'), (189, 'AG4166'), (277, 'AG4171'), (362, 'AG4170')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23633 of IO tensor {'CrossPassTensor': ''}bfloat16 %input359|NC|(2, 128, 8, 2, 2, 128) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (363, 'AG4169')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23624 of IO tensor {'CrossPassTensor': ''}bfloat16 %input360|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23629 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23629 of IO tensor {'CrossPassTensor': ''}bfloat16 %input362|NC|(2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (187, 'AG4168'), (188, 'AG4167'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23625 of IO tensor {'CrossPassTensor': ''}bfloat16 %input364|NC|(2, 2, 128, 8, 2, 2, 2, 64) is not sorted, index list (w/ AG ids): [(187, 'AG4168'), (191, 'AG3731'), (188, 'AG4167'), (189, 'AG4166'), (220, 'AG4165'), (472, 'AG3732')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(361, 'AG4174'), (216, 'AG4176'), (276, 'AG4175')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23638 of IO tensor {'CrossPassTensor': ''}bfloat16 %input365|NC|(8, 2, 128, 6, 2, 2, 128) is not sorted, index list (w/ AG ids): [(64, 'AG4180'), (28, 'AG4177'), (275, 'AG4179'), (360, 'AG4178')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23637 of IO tensor {'CrossPassTensor': ''}bfloat16 %input366|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23635 of IO tensor {'CrossPassTensor': ''}bfloat16 %input367|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23636 of IO tensor {'CrossPassTensor': ''}bfloat16 %input368|NC|(2, 6, 128, 2, 8, 2, 128) is not sorted, index list (w/ AG ids): [(28, 'AG4177'), (216, 'AG4176'), (191, 'AG3731'), (276, 'AG4175'), (361, 'AG4174')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23171 of IO tensor {'CrossPassTensor': ''}bfloat16 %input369|NC|(2, 37984, 2, 8, 128) is not sorted, index list (w/ AG ids): [(190, 'AG4182'), (217, 'AG4181'), (191, 'AG3731')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: P dims of loadstore 23639 of IO tensor {'CrossPassTensor': ''}bfloat16 %input370|N|(128, 2, 8) is not sorted, index list (w/ AG ids): [(473, 'AG3729'), (359, 'AG3730')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23217 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23218 of IO tensor non_local bfloat16 %all_gather.1(2, 8, 128, 8) is not sorted, index list (w/ AG ids): [(191, 'AG3731'), (77, 'AG3728')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16509 of IO tensor non_local float32 %get_tuple_element.3(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 23193 of IO tensor non_local uint32 %get_tuple_element.4(8, 2, 128) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16484 of IO tensor non_local int32 %gather.2|NC|(8, 256) is not sorted, index list (w/ AG ids): [(69, 'AG4189'), (32, 'AG4193')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16527 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: WARNING: non P dims of loadstore 16544 of IO tensor non_local float32 %get_tuple_element.5(8, 2, 128) is not sorted, index list (w/ AG ids): [(72, 'AG4196'), (33, 'AG4195')] +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/AGOrderingAnalysisPass]: AGOrderingAnalysisPass finished after 2.577 seconds +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Running StaticTransposeLocalTensor +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/StaticTransposeLocalTensor]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/StaticTransposeLocalTensor]: StaticTransposeLocalTensor finished after 0.248 seconds +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/PComputeCutting]: Running PComputeCutting +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/PComputeCutting]: Finished (changed=True) +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/PComputeCutting]: PComputeCutting finished after 0.319 seconds +2025-11-04T21:39:46Z INFO 8881 [sg0000/Tensorizer/BFComputeCutting]: Running BFComputeCutting +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/BFComputeCutting]: Finished (changed=True) +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/BFComputeCutting]: BFComputeCutting finished after 0.416 seconds +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/LoopSplitting]: Running LoopSplitting +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/LoopSplitting]: Finished (changed=False) +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/LoopSplitting]: LoopSplitting finished after 0.031 seconds +2025-11-04T21:39:47Z INFO 8881 [sg0000/Tensorizer/MacroGeneration]: Running MacroGeneration +2025-11-04T21:39:49Z INFO 8881 [sg0000/Tensorizer/MacroGeneration]: Finished (changed=True) +2025-11-04T21:39:49Z INFO 8881 [sg0000/Tensorizer/MacroGeneration]: MacroGeneration finished after 2.382 seconds +2025-11-04T21:39:49Z INFO 8881 [sg0000/Tensorizer/PGTiling]: PGTiling finished after 6.016 seconds +2025-11-04T21:39:49Z INFO 8881 [sg0000/Tensorizer/InsertIOTransposes]: Running InsertIOTransposes +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertIOTransposes]: Finished (changed=True) +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertIOTransposes]: InsertIOTransposes finished after 0.877 seconds +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertOffloadedTransposes]: Running InsertOffloadedTransposes +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertOffloadedTransposes]: OffloadedTranspose inserted: 0 +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertOffloadedTransposes]: Finished (changed=False) +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/InsertOffloadedTransposes]: InsertOffloadedTransposes finished after 0.133 seconds +2025-11-04T21:39:50Z INFO 8881 [sg0000/Tensorizer/DramToDramTranspose]: Running DramToDramTranspose +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/DramToDramTranspose]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/DramToDramTranspose]: DramToDramTranspose finished after 0.341 seconds +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/PGLayoutTilingPipeline]: PGLayoutTilingPipeline finished after 41.049 seconds +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingProfiler]: Running TilingProfiler +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingProfiler]: Finished (changed=False) +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/TilingProfiler]: TilingProfiler finished after 0.478 seconds +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.318 seconds +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor +2025-11-04T21:39:51Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_0 +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_0 finished after 1.645 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: Running InferNeuronTensor_iteration_1 +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor_iteration_1 finished after 0.069 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/InferNeuronTensor]: InferNeuronTensor finished after 1.716 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.311 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.311 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.086 seconds +2025-11-04T21:39:53Z INFO 8881 [sg0000/Tensorizer/RewriteReplicationMatmul]: Running RewriteReplicationMatmul +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/RewriteReplicationMatmul]: Finished (changed=False) +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/RewriteReplicationMatmul]: RewriteReplicationMatmul finished after 0.046 seconds +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.185 seconds +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.121 seconds +2025-11-04T21:39:54Z INFO 8881 [sg0000/Tensorizer/DataLocalityOpt]: Running DataLocalityOpt +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/DataLocalityOpt]: Finished (changed=True) +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/DataLocalityOpt]: DataLocalityOpt finished after 3.235 seconds +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/DMATilingProfiler]: Running DMATilingProfiler +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: +20 MACROS WITH LARGEST INSTRUCTION COUNTS: +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: transpose_128x128 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 9504: matmul_128x128x8 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: transpose_128x64 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/PostDLOTilingBottleneck]: 2048: matmul_64x128x2 +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/DMATilingProfiler]: Finished (changed=False) +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/DMATilingProfiler]: DMATilingProfiler finished after 0.096 seconds +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:57Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.355 seconds +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.356 seconds +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaMacro]: Running LegalizeSundaMacro +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaMacro]: Finished (changed=True) +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaMacro]: LegalizeSundaMacro finished after 0.700 seconds +2025-11-04T21:39:58Z INFO 8881 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Running InsertImplicitShardAxisBeforeISel +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: Finished (changed=True) +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/InsertImplicitShardAxisBeforeISel]: InsertImplicitShardAxisBeforeISel finished after 0.414 seconds +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.516 seconds +2025-11-04T21:39:59Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_1 +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_1 finished after 0.517 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 1.034 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: Running PerfectLoopNest +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/PerfectLoopNest]: PerfectLoopNest finished after 0.072 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.223 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/RewriteWeights]: Running RewriteWeights +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/RewriteWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/RewriteWeights]: RewriteWeights finished after 0.088 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/ReshapeWeights]: Running ReshapeWeights +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/ReshapeWeights]: Finished (changed=True) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/ReshapeWeights]: ReshapeWeights finished after 0.032 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Running FlattenMacroLoop +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: Finished (changed=False) +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/FlattenMacroLoop]: FlattenMacroLoop finished after 0.101 seconds +2025-11-04T21:40:00Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: Running SimplifyMacroPredicates +2025-11-04T21:40:01Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: Finished (changed=True) +2025-11-04T21:40:01Z INFO 8881 [sg0000/Tensorizer/SimplifyMacroPredicates]: SimplifyMacroPredicates finished after 0.281 seconds +2025-11-04T21:40:01Z INFO 8881 [sg0000/Tensorizer/InferInitValue]: Running InferInitValue +2025-11-04T21:40:02Z INFO 8881 [sg0000/Tensorizer/InferInitValue]: Finished (changed=True) +2025-11-04T21:40:02Z INFO 8881 [sg0000/Tensorizer/InferInitValue]: InferInitValue finished after 1.704 seconds +2025-11-04T21:40:02Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier +2025-11-04T21:40:02Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Running NeuronSimplifier_iteration_0 +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier_iteration_0 finished after 0.504 seconds +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: Finished (changed=False) +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifier]: NeuronSimplifier finished after 0.505 seconds +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SimplifyTensor]: Running SimplifyTensor +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SimplifyTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SimplifyTensor]: DeadCodeElimination_iteration_0 finished after 0.076 seconds +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SimplifyTensor]: Finished (changed=False) +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SimplifyTensor]: SimplifyTensor finished after 0.277 seconds +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/LICM]: Running LICM +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/LICM]: Finished (changed=True) +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/LICM]: LICM finished after 0.121 seconds +2025-11-04T21:40:03Z INFO 8881 [sg0000/Tensorizer/SundaISel]: Running SundaISel +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/SundaISel]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/SundaISel]: SundaISel finished after 1.404 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronAliasDependencyReset]: Running NeuronAliasDependencyReset +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Running AliasDependencyElimination +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/AliasDependencyElimination]: AliasDependencyElimination finished after 0.005 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Running NeuronAliasDependencyInduction +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronAliasDependencyInduction]: NeuronAliasDependencyInduction finished after 0.022 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronAliasDependencyReset]: NeuronAliasDependencyReset finished after 0.039 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/LowerComplexBroadcast]: Running LowerComplexBroadcast +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/LowerComplexBroadcast]: Finished (changed=False) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/LowerComplexBroadcast]: LowerComplexBroadcast finished after 0.087 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.081 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=True) +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.348 seconds +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion +2025-11-04T21:40:05Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_0 +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_0 finished after 0.554 seconds +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_1 +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_1 finished after 0.273 seconds +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_2 +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_2 finished after 0.247 seconds +2025-11-04T21:40:06Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_3 +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_3 finished after 0.247 seconds +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Running NeuronLoopFusion_iteration_4 +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion_iteration_4 finished after 0.249 seconds +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopFusion]: NeuronLoopFusion finished after 1.573 seconds +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: Running NeuronLoopInterchange +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: Finished (changed=False) +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLoopInterchange]: NeuronLoopInterchange finished after 0.066 seconds +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.355 seconds +2025-11-04T21:40:07Z INFO 8881 [sg0000/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:08Z INFO 8881 [sg0000/Tensorizer/FactorizeBlkDims]: Finished (changed=True) +2025-11-04T21:40:08Z INFO 8881 [sg0000/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.604 seconds +2025-11-04T21:40:08Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:08Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 1.039 seconds +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.203 seconds +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_2 +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_2 finished after 0.183 seconds +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 1.436 seconds +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.126 seconds +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:09Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.203 seconds +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.191 seconds +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.402 seconds +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.092 seconds +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA +2025-11-04T21:40:10Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_0 +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_0 finished after 0.628 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: Running VectorizeDMA_iteration_1 +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA_iteration_1 finished after 0.051 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/VectorizeDMA]: VectorizeDMA finished after 0.681 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.269 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/LegalizePartitionReduce]: Running LegalizePartitionReduce +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/LegalizePartitionReduce]: Finished (changed=True) +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/LegalizePartitionReduce]: LegalizePartitionReduce finished after 0.091 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/DeConcat]: Running DeConcat +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/DeConcat]: Running DeConcat_iteration_0 +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/DeConcat]: DeConcat_iteration_0 finished after 0.062 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/DeConcat]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/DeConcat]: DeConcat finished after 0.063 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Running FactorizeThreadAxesInFreeDims +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: Finished (changed=False) +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/FactorizeThreadAxesInFreeDims]: FactorizeThreadAxesInFreeDims finished after 0.090 seconds +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion +2025-11-04T21:40:11Z INFO 8881 [sg0000/Tensorizer/PartialSimdFusion]: Running PartialSimdFusion_iteration_0 +2025-11-04T21:40:12Z INFO 8881 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion_iteration_0 finished after 0.836 seconds +2025-11-04T21:40:12Z INFO 8881 [sg0000/Tensorizer/PartialSimdFusion]: Finished (changed=True) +2025-11-04T21:40:12Z INFO 8881 [sg0000/Tensorizer/PartialSimdFusion]: PartialSimdFusion finished after 0.836 seconds +2025-11-04T21:40:12Z INFO 8881 [sg0000/Tensorizer/TritiumFusion]: Running TritiumFusion +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/TritiumFusion]: Finished (changed=True) +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/TritiumFusion]: TritiumFusion finished after 1.632 seconds +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.574 seconds +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.575 seconds +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/VectorizeMatMult]: Running VectorizeMatMult +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/VectorizeMatMult]: Finished (changed=False) +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/VectorizeMatMult]: VectorizeMatMult finished after 0.081 seconds +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion +2025-11-04T21:40:14Z INFO 8881 [sg0000/Tensorizer/PartialLoopFusion]: Running PartialLoopFusion_iteration_0 +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion_iteration_0 finished after 1.253 seconds +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/PartialLoopFusion]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/PartialLoopFusion]: PartialLoopFusion finished after 1.253 seconds +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.193 seconds +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/LowerTranspose]: Finished (changed=True) +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.489 seconds +2025-11-04T21:40:16Z INFO 8881 [sg0000/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:17Z INFO 8881 [sg0000/Tensorizer/LowerBroadcast]: Finished (changed=True) +2025-11-04T21:40:17Z INFO 8881 [sg0000/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.545 seconds +2025-11-04T21:40:17Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:17Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.963 seconds +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_1 +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_1 finished after 0.181 seconds +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 1.152 seconds +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/SplitAccGrp]: Running SplitAccGrp +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/SplitAccGrp]: Finished (changed=False) +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/SplitAccGrp]: SplitAccGrp finished after 0.049 seconds +2025-11-04T21:40:18Z INFO 8881 [sg0000/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:19Z INFO 8881 [sg0000/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:19Z INFO 8881 [sg0000/Tensorizer/SpillPSum]: SpillPSum finished after 0.710 seconds +2025-11-04T21:40:19Z INFO 8881 [sg0000/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/LowerIntrinsics]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.932 seconds +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/InlineNativeKernels]: Running InlineNativeKernels +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/InlineNativeKernels]: Finished (changed=False) +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/InlineNativeKernels]: InlineNativeKernels finished after 0.057 seconds +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/LegalizeType]: LegalizeType finished after 0.140 seconds +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: Finished (changed=True) +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.278 seconds +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:20Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:21Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.754 seconds +2025-11-04T21:40:21Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_1 +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_1 finished after 0.661 seconds +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: Finished (changed=True) +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 1.417 seconds +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.063 seconds +2025-11-04T21:40:22Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.966 seconds +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/RelaxPredicates]: Running RelaxPredicates +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/RelaxPredicates]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/RelaxPredicates]: RelaxPredicates finished after 0.119 seconds +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/TensorInitialization]: Running TensorInitialization +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/TensorInitialization]: Finished (changed=True) +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/TensorInitialization]: TensorInitialization finished after 0.225 seconds +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.278 seconds +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.089 seconds +2025-11-04T21:40:23Z INFO 8881 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.039 seconds +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.363 seconds +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DMALocalityOpt]: Finished (changed=True) +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.042 seconds +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DataStreaming]: Finished (changed=True) +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/DataStreaming]: DataStreaming finished after 0.168 seconds +2025-11-04T21:40:24Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:28Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:30Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 2.250 seconds +2025-11-04T21:40:30Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_1 +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_1 finished after 0.250 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 6.720 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/LateLegalizeInst]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.150 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/CoalesceCCOp]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.175 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.067 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/InsertCoreBarrier]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.131 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 1.523ms (300.000MiB, est bw: 206.549GB/s, 8.430% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[2] bfloat16 (2, 297, 128, 2048) %'38153.56437'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i0.128,i1.128+128i2.16] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (2, 37984, 16, 128) %'input369'[i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1,i2.16,i1.128] # id=56436, src_id=None, , instances=600 # dl = tensor_op_name: input369_pftranspose_38153 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 230.909us (2.344MiB, est bw: 10.643GB/s, 1.278% of tot. time) for float32<8 x 128> non_local float32 (8, 2, 37984) %'convert.656'[i1.8,i4894_0,i0.128+512i4894_1_0_0+128i4894_1_0_1] = store float32<8 x 128> TongaSB partitions[2] float32 (2, 297, 8, 128) %'38666.56447'[i4894_0,4i4894_1_0_0+i4894_1_0_1,i1.8,i0.128] # id=56445, src_id=None, , instances=600 # dl = tensor_op_name: convert.656_pftranspose_38666 | hlo_id: 16522 | if -i0.128-512i4894_1_0_0-128i4894_1_0_1+37983 >= 0 and -4i4894_1_0_0-i4894_1_0_1+296 >= 0 [[i1.8];[i0.128]] -> [[i1.8];[i0.128]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input5_local_39959'[i160_25181_0_42928,2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input5'[2i183_0_0+i183_0_1,i183_1_0_0,i183_1_0_1,i160_25181_0_42928,i2.16,i0.128,i1.128] # id=48172, src_id=None, , instances=64 # dl = tensor_op_name: _dot.354 | hlo_id: 13489 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input7_local_40070'[i359_25182_0_43002,2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input7'[2i382_0_0+i382_0_1,i382_1_0_0,i382_1_0_1,i359_25182_0_43002,i2.16,i0.128,i1.128] # id=48372, src_id=None, , instances=64 # dl = tensor_op_name: _dot.698 | hlo_id: 13600 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input9_local_40173'[i531_25183_0_43076,2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input9'[2i554_0_0+i554_0_1,i554_1_0_0,i554_1_0_1,i531_25183_0_43076,i2.16,i0.128,i1.128] # id=48568, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1042 | hlo_id: 13711 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input11_local_40276'[i703_25184_0_43150,2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input11'[2i726_0_0+i726_0_1,i726_1_0_0,i726_1_0_1,i703_25184_0_43150,i2.16,i0.128,i1.128] # id=48764, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1386 | hlo_id: 13822 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input13_local_40379'[i875_25185_0_43224,2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input13'[2i898_0_0+i898_0_1,i898_1_0_0,i898_1_0_1,i875_25185_0_43224,i2.16,i0.128,i1.128] # id=48960, src_id=None, , instances=64 # dl = tensor_op_name: _dot.1730 | hlo_id: 13933 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input15_local_40482'[i1047_25186_0_43298,2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input15'[2i1070_0_0+i1070_0_1,i1070_1_0_0,i1070_1_0_1,i1047_25186_0_43298,i2.16,i0.128,i1.128] # id=49156, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2074 | hlo_id: 14044 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input17_local_40585'[i1219_25187_0_43372,2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input17'[2i1242_0_0+i1242_0_1,i1242_1_0_0,i1242_1_0_1,i1219_25187_0_43372,i2.16,i0.128,i1.128] # id=49352, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2418 | hlo_id: 14155 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Est. DMA time: 163.614us (32.000MiB, est bw: 205.083GB/s, 0.906% of tot. time) for bfloat16<128 x 2048> TongaSB partitions[4] bfloat16 (2, 8, 2, 2, 128, 16, 128) %'input19_local_40688'[i1391_25188_0_43446,2i1414_0_0+i1414_0_1,i1414_1_0_0,i1414_1_0_1,i0.128,i2.16,i1.128] = load bfloat16<128 x 2048> {'CrossPassTensor': ''}bfloat16 (8, 2, 2, 2, 16, 128, 128) %'input19'[2i1414_0_0+i1414_0_1,i1414_1_0_0,i1414_1_0_1,i1391_25188_0_43446,i2.16,i0.128,i1.128] # id=49548, src_id=None, , instances=64 # dl = tensor_op_name: _dot.2762 | hlo_id: 14266 | [[i0.128];[i1.128, i2.16]] -> [[i0.128];[i1.128, i2.16]] +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.081 seconds +2025-11-04T21:40:31Z INFO 8881 [sg0000/Tensorizer/OptimizeNKIKernels]: Running OptimizeNKIKernels +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.008 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.008 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.014 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:31Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.017 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.029 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 5004) %4(init=0.0)[i0.128,i1.4748] = load float32<128 x 4748> float32 (128, 4748) %6[i0.128,i1.4748] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 11.200us (2.318MiB, est bw: 217.043GB/s, 31.647% of tot. time) for float32<128 x 4748> TongaSB partitions[0] float32 (128, 4748) %10[i0.128,i1.4748] = load float32<128 x 4748> float32 (8, 75968) %'inp'[i0.128,i1.4748] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.4748]] -> [[i0.128];[i1.4748]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 5.874% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 5.842% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 4.965% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 4.608% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_1 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_1 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.013 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SpillPSum]: SpillPSum finished after 0.012 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeType]: LegalizeType finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.006 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.007 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.009 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.047 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DataStreaming]: DataStreaming finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.020 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.005 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.003 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.079us (64.000KiB, est bw: 31.526GB/s, 12.331% of tot. time) for float32<128 x 128> TongaSB partitions[0] float32 (128, 128) %293[i0.128,i1.128] = load float32<128 x 128> float32 (128, 128) %3[i0.128,i1.128] # id=13, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.128]] -> [[i0.128];[i1.128]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %204[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %192[i0.8,i1.256] # id=194, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 2.068us (8.000KiB, est bw: 3.962GB/s, 12.264% of tot. time) for uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %207[i0.8,i1.256] = load float32<8 x 256> float32 (128, 16) %195[i0.8,i1.256] # id=197, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 288) %4(init=0.0)[i0.128,i1.32] = load float32<128 x 32> float32 (128, 32) %6[i0.128,i1.32] # id=7, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.934us (16.000KiB, est bw: 8.471GB/s, 11.472% of tot. time) for float32<128 x 32> TongaSB partitions[0] float32 (128, 32) %10[i0.128,i1.32] = load float32<128 x 32> float32 (8, 512) %'inp'[i0.128,i1.32] # id=9, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.32]] -> [[i0.128];[i1.32]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for uint32<8 x 256> uint32 (8, 256) %'topk_indices'[i0.8,i1.256] = store uint32<8 x 256> TongaSB partitions[0] uint32 (8, 256) %'global_id_buf'(init=0.0)[i0.8,i1.256] # id=210, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.757us (8.000KiB, est bw: 4.662GB/s, 10.424% of tot. time) for float32<8 x 256> float32 (8, 256) %'topk_values'[i0.8,i1.256] = store float32<8 x 256> TongaSB partitions[0] float32 (8, 256) %'val_buf'(init=0.0)[i0.8,i1.256] # id=212, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.8];[i1.256]] -> [[i0.8];[i1.256]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %192[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %182[i0.128,i1.16] # id=193, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Est. DMA time: 1.631us (8.000KiB, est bw: 5.023GB/s, 9.674% of tot. time) for float32<128 x 16> float32 (128, 16) %195[i0.128,i1.16] = store float32<128 x 16> TongaSB partitions[0] float32 (128, 16) %309[i0.128,i1.16] # id=196, src_id=None, , instances=1 # dl = tensor_op_name: | /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/nki/_pre_prod_kernels/topk/topk.py:45:0 | [[i0.128];[i1.16]] -> [[i0.128];[i1.16]] +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.004 seconds +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [topk/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.002 seconds +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:32Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DoNothing]: Running DoNothing +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DoNothing]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DoNothing]: DoNothing finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: Running FactorizeBlkDims +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/FactorizeBlkDims]: FactorizeBlkDims finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: Running NeuronValueNumbering +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronValueNumbering]: NeuronValueNumbering finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Running NeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronInstComb]: NeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: Running LowerTranspose +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerTranspose]: LowerTranspose finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: Running LowerBroadcast +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerBroadcast]: LowerBroadcast finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Running LateNeuronInstComb_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateNeuronInstComb]: LateNeuronInstComb finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: Running SpillPSum +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SpillPSum]: SpillPSum finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: Running LowerIntrinsics +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LowerIntrinsics]: LowerIntrinsics finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: Running LegalizeType +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeType]: LegalizeType finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: Running NeuronLICM +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronLICM]: NeuronLICM finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Running InferPSumTensor_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferPSumTensor]: InferPSumTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: Running WeightCoalescing +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/WeightCoalescing]: WeightCoalescing finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: Running LegalizeSundaAccess +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LegalizeSundaAccess]: LegalizeSundaAccess finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Running NeuronSimplifyPredicates +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/NeuronSimplifyPredicates]: NeuronSimplifyPredicates finished after 0.002 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: Running ExpandISAMacro +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/ExpandISAMacro]: ExpandISAMacro finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running SimplifyNeuronTensor +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Running DeadCodeElimination_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: DeadCodeElimination_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimplifyNeuronTensor]: SimplifyNeuronTensor finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: Running DMALocalityOpt +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMALocalityOpt]: DMALocalityOpt finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: Running DataStreaming +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DataStreaming]: DataStreaming finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Running SFKVectorizer +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Running VectorizeLoop_iteration_0 +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: VectorizeLoop_iteration_0 finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SFKVectorizer]: SFKVectorizer finished after 0.003 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: Running LateLegalizeInst +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/LateLegalizeInst]: LateLegalizeInst finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: Running CoalesceCCOp +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/CoalesceCCOp]: CoalesceCCOp finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: Running SimpleAllReduceTiling +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/SimpleAllReduceTiling]: SimpleAllReduceTiling finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: Running InsertCoreBarrier +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InsertCoreBarrier]: InsertCoreBarrier finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Running DMAProfiler +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Top 10 (estimated) latency DMAs: +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 5.852us (1.000MiB, est bw: 179.191GB/s, 59.288% of tot. time) for float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %13[i0.128,i1.2048] = load float32<128 x 2048> float32 (8, 256) %'x'[i0.128,i1.2048] # id=8, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Est. DMA time: 4.018us (1.000MiB, est bw: 260.951GB/s, 40.712% of tot. time) for float32<128 x 2048> float32 (8, 256) %'y'[i0.128,i1.2048] = store float32<128 x 2048> TongaSB partitions[0] float32 (128, 2048) %11[i0.128,i1.2048] # id=10, src_id=None, , instances=1 # dl = tensor_op_name: | if -i0.128+7 >= 0 and -i1.2048+255 >= 0 [[i0.128];[i1.2048]] -> [[i0.128];[i1.2048]] +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: Finished (changed=False) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/DMAProfiler]: DMAProfiler finished after 0.001 seconds +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [cumsum/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.000 seconds +2025-11-04T21:40:33Z INFO 8881 [sg0000/Tensorizer/OptimizeNKIKernels]: Finished (changed=True) +2025-11-04T21:40:33Z INFO 8881 [sg0000/Tensorizer/OptimizeNKIKernels]: OptimizeNKIKernels finished after 1.754 seconds +2025-11-04T21:40:33Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:33Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 0.902 seconds +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=True) +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 0.903 seconds +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/StaticProfiler]: Running StaticProfiler +2025-11-04T21:40:34Z WARNING 8881 [sg0000/Tensorizer/StaticProfiler]: matmul-based transposes inserted by penguin takes up 89.98 percent of all matmul computation +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/StaticProfiler]: Finished (changed=False) +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/StaticProfiler]: StaticProfiler finished after 0.138 seconds +2025-11-04T21:40:34Z INFO 8881 [sg0000/Tensorizer/SplitAPUnionSets]: Running SplitAPUnionSets +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/SplitAPUnionSets]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/SplitAPUnionSets]: SplitAPUnionSets finished after 0.524 seconds +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LateLegalizePostSplit]: Running LateLegalizePostSplit +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LateLegalizePostSplit]: Finished (changed=False) +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LateLegalizePostSplit]: LateLegalizePostSplit finished after 0.105 seconds +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: Running InferSharedMemLoc +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/InferSharedMemLoc]: InferSharedMemLoc finished after 0.119 seconds +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LowerShardAxis]: Running LowerShardAxis +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LowerShardAxis]: Finished (changed=True) +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/LowerShardAxis]: LowerShardAxis finished after 0.236 seconds +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion +2025-11-04T21:40:35Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Running CCOpFusion_iteration_0 +2025-11-04T21:40:36Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion_iteration_0 finished after 1.236 seconds +2025-11-04T21:40:36Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: Finished (changed=False) +2025-11-04T21:40:36Z INFO 8881 [sg0000/Tensorizer/CCOpFusion]: CCOpFusion finished after 1.236 seconds +2025-11-04T21:40:36Z INFO 8881 [sg0000/Tensorizer/DumpGraphAndMetadata]: Running DumpGraphAndMetadata +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/DumpGraphAndMetadata]: Finished (changed=False) +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/DumpGraphAndMetadata]: DumpGraphAndMetadata finished after 0.140 seconds +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Running ZeroSizeTensorElimination +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/ZeroSizeTensorElimination]: Finished (changed=False) +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/ZeroSizeTensorElimination]: ZeroSizeTensorElimination finished after 0.000 seconds +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/LowerToSendRecv]: Running LowerToSendRecv +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/LowerToSendRecv]: Finished (changed=True) +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/LowerToSendRecv]: LowerToSendRecv finished after 0.168 seconds +2025-11-04T21:40:37Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:39Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:39Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 1.965 seconds +2025-11-04T21:40:41Z INFO 8881 [Tensorizer]: BirCodeGen estimate #instances=157987 in sg0000 +2025-11-04T21:40:41Z INFO 8881 [Tensorizer]: IR signature: 67ed198f41be767a8bd74be249b77bbd52002093181bdb7574f808e35bd9dd58 for nc00/sg0000/TensorizerBIR +2025-11-04T21:40:41Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: Running BirCodeGenLoop +2025-11-04T21:40:43Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: Finished (changed=False) +2025-11-04T21:40:43Z INFO 8881 [sg0000/Tensorizer/BirCodeGenLoop]: BirCodeGenLoop finished after 2.044 seconds +2025-11-04T21:40:45Z INFO 8881 [Tensorizer]: BirCodeGen estimate #instances=157987 in sg0000 +2025-11-04T21:40:45Z INFO 8881 [Tensorizer]: IR signature: c28bc749dec2c85c40ea54410f0aea12737654333398febcefcfceb818abc610 for nc01/sg0000/TensorizerBIR +2025-11-04T21:40:45Z INFO 8881 [Tensorizer]: Weights total number of bytes: 2810120 +2025-11-04T21:40:45Z INFO 8881 [Tensorizer]: Successfully built model. +2025-11-04T21:40:45Z USER 8881 [root/Tensorizer/Tensorizer]: Tensorizer finished after 127.479 seconds +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: End tensorization +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input0 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input1 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input2 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input3 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input4 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input5 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input6 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input7 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input8 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input9 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input10 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input11 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input12 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input13 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input14 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input15 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input16 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input17 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input18 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input19 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input20 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input21 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input22 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input23 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input24 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input25 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input26 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input27 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input28 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input29 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input30 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input31 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input32 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input33 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input34 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input35 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input36 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input37 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input38 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input39 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input40 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input41 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input42 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input43 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input44 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input45 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input46 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input47 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input48 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input49 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input50 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input51 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input52 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input53 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input54 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input55 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input56 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input57 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input58 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input59 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input60 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input61 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input62 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input63 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input64 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input65 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input66 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input67 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input68 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input69 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input70 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input71 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input72 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input73 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input74 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input75 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input76 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input77 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input78 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input79 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input80 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input81 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input82 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input83 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input84 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input85 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input86 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input87 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input88 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input89 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input90 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input91 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input92 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input93 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input94 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input95 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input96 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input97 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input98 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input99 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input100 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input101 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input102 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input103 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input104 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input105 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input106 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input107 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input108 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input109 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input110 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input111 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input112 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input113 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input114 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input115 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input116 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input117 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input118 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input119 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input120 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input121 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input122 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input123 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input124 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input125 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input126 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input127 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input128 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input129 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input130 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input131 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input132 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input133 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input134 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input135 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input136 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input137 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input138 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input139 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input140 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input141 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input142 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input143 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input144 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input145 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input146 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input147 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input148 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input149 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input150 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input151 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input152 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input153 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input154 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input155 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input156 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input157 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input158 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input159 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input160 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input161 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input162 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input163 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input164 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input165 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input166 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input167 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input168 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input169 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input170 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input171 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input172 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input173 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input174 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input175 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input176 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input177 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input178 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input179 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input180 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input181 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input182 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input183 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input184 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input185 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input186 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input187 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input188 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input189 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input190 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input191 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input192 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input193 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input194 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input195 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input196 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input197 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input198 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input199 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input200 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input201 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input202 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input203 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input204 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input205 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input206 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input207 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input208 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input209 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input210 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input211 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input212 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input213 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input214 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input215 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input216 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input217 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input218 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input219 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input220 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input221 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input222 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input223 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input224 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input225 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input226 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input227 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input228 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input229 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input230 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input231 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input232 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input233 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input234 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input235 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input236 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input237 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input238 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input239 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input240 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input241 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input242 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input243 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input244 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input245 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input246 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input247 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input248 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input249 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input250 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input251 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input252 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input253 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input254 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input255 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input256 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input257 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input258 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input259 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input260 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input261 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input262 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input263 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input264 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input265 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input266 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input267 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input268 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input269 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input270 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input271 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input272 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input273 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input274 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input275 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input276 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input277 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input278 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input279 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input280 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input281 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input282 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input283 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input284 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input285 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input286 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input287 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input288 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input289 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input290 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input291 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input292 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input293 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input294 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input295 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input296 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input297 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input298 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input299 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input300 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input301 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input302 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input303 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input304 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input305 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input306 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input307 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input308 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input309 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input310 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input311 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input312 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input313 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input314 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input315 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input316 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input317 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input318 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input319 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input320 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input321 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input322 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input323 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input324 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input325 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input326 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input327 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input328 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input329 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input330 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input331 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input332 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input333 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input334 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input335 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input336 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input337 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input338 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input339 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input340 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input341 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input342 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input343 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input344 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input345 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input346 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input347 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input348 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input349 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input350 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input351 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input352 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input353 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input354 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input355 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input356 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input357 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input358 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input359 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input360 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input361 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input362 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input363 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input364 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input365 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input366 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input367 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input368 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input369 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Network input: input370 +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: wrote bir.json +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: wrote tensor_map.json +2025-11-04T21:40:45Z INFO 8881 [job.Frontend.0]: Job #0 finished +2025-11-04T21:40:45Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.Frontend.0 +2025-11-04T21:40:45Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.StaticIOTranspose.0 +2025-11-04T21:40:45Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.StaticIOTranspose.0 +2025-11-04T21:40:45Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.WalrusDriver.0 +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: BackendDriver has 2 states with 2 core LNC +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: BackendDriver VNC cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704 +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: BackendDriver: no partitions within VNC found. Switching to VNC + flat flow. +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: BackendDriver in_state.num_states 2 with 2 core LNC +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: Executing /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/walrus_driver --optlevel 2 --allocator coloring --verbose 35 --logfile-verbose 20 --logfile /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/log-neuron-cc.txt --vnc-nc-per-sengine 2 --link-subgraphs nc00/sg00,nc01/sg00 --execute-repetition 1 -i bir.json --min_split_size 10240 --skip_split_vns '' --no_split_dram --split_huge_dram_tensor 1.0 --preprocessing_only --max_tensorizer_distance 64 --pack_same_shape_only --instruction_fetch_latency 511 --max-partitions 1 --policy 3 --auxflag 0 --interleave none --schedule-delayed-latency 1 --postsched-mm-accum-reorder=false --max-load-color-rotation --max-load-lower-bound 0.14 --mm-reorder-opt --force-prefetch-follow-incoming-order -1 --allreduce-buffer-size 500 --dram-page-size 512 --dram-rotation-size -1 --allreduce-rotation-dis 8 --repeat-load-thres 4 --enable-mm-transpose-remat-optimization=true --save-len-thres 512 --save-dma-cnt-thres 32 --print-format json --relaxed-order=true --enable-anti-dependence-reduction=false --num-semaphores-per-queue 16 --numcores 1 --act-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/pwp/pwp_bin_trainium/act_info.json --dve-root-json /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json --unified-backend-and-legacy-codegen --enable-verifier=true --enable-birsim=false --enable-birsim-sync-only=false --enable-data-race-checker=false --enable-new-backend=true --inject-error=NONE --dge-levels spill_reload,io,scalar_dynamic_offset,vector_dynamic_offsets --dynamic-dma-scratch-size-per-partition=16384 --neff-output-filename /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: Working directory is /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704 +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: propagate_exit=True +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: use_logger=False +2025-11-04T21:40:45Z INFO 8881 [job.WalrusDriver.0]: expose_stderr=True +2025-11-04T21:40:45Z INFO 9624 [Logging]: Logging to ../log-neuron-cc.txt at level 'INFO' +2025-11-04T21:40:45Z INFO 9624 [BackendDriver]: max_allowed_parallelism=12 +2025-11-04T21:40:45Z INFO 9624 [BackendDriver]: Loading module from nc00/sg00/bir.json +2025-11-04T21:40:45Z INFO 9624 [BackendDriver]: Loading module from nc01/sg00/bir.json +2025-11-04T21:40:46Z INFO 9624 [BackendDriver]: Backend driver mtBackend: false numModules: 2 Cwd: "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704" +2025-11-04T21:40:46Z INFO 9624 [BackendDriver]: DynamicDMA is enabled +2025-11-04T21:40:46Z INFO 9624 [BackendDriver]: DynamicDMA levels being enabled: io, spill_reload, scalar_dynamic_offset, vector_dynamic_offsets, +2025-11-04T21:40:46Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:46Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:46Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running do_nothing +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9624 (nc00/sg00) [ModuleForkPass]: do_nothing finished after 0.004 seconds +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 535mb, ru_maxrss: 973mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to do_nothing: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9624 (nc01/sg00) [ModuleForkPass]: do_nothing finished after 0.005 seconds +2025-11-04T21:40:46Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 535mb, ru_maxrss: 973mb (delta=0mb) +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:46Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:46Z WARNING 9624 [birverifier::InstVisitor]: (nc00/sg00) Non - output memory location with no reader: {convert.363.63867}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:46Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {convert.363.63867}@SB<0,0>(1x2)#Internal DebugInfo: +2025-11-04T21:40:46Z USER 9624 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.382 seconds +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1102mb, ru_maxrss: 1102mb (delta=129mb) +2025-11-04T21:40:46Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.423 seconds +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1146mb, ru_maxrss: 1146mb (delta=173mb) +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:47Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.448 seconds +2025-11-04T21:40:47Z INFO 9624 [BackendPassManager]: curr_vmrss: 1145mb, ru_maxrss: 1146mb (delta=173mb) +2025-11-04T21:40:47Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:47Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:47Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=12176 blocks=2 instructions=10000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.011 seconds +2025-11-04T21:40:47Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 1146mb, ru_maxrss: 1146mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 12176 memory location(s), 2 block(s), and 10000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:47Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.027 seconds +2025-11-04T21:40:47Z INFO 9624 [BackendPassManager]: curr_vmrss: 1146mb, ru_maxrss: 1146mb (delta=0mb) +2025-11-04T21:40:47Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:47Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=12176 blocks=2 instructions=10000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:47Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running expand_replication +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to expand_replication: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:47Z USER 9624 (nc01/sg00) [ModuleForkPass]: expand_replication finished after 0.004 seconds +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 1146mb, ru_maxrss: 1146mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [ExpandReplication]: Found 0 replicated matmults +2025-11-04T21:40:47Z USER 9624 (nc00/sg00) [ModuleForkPass]: expand_replication finished after 0.004 seconds +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 1146mb, ru_maxrss: 1146mb (delta=0mb) +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 6088 memory location(s), 1 block(s), and 5000 instruction(s). Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running unroll +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z INFO 9624 (nc01/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to unroll: modules=1 functions=1 allocs=6088 blocks=1 instructions=5000 Max writers: 49 Max Readers: 310 +2025-11-04T21:40:47Z INFO 9624 (nc00/sg00) [Unroll]: INFO (Unroll) Start unrolling at Tue Nov 4 21:40:47 2025 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:47 2025 + +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Total count: 144952 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Matmult: 112813 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: GenericCopy: 23041 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Load: 2588 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: TensorScalarPtr: 2122 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: TensorTensor: 1554 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Activation: 817 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Select: 450 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Save: 338 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Memset: 275 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: TensorReduce: 119 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Iota: 58 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: DMACopy: 10 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 10 +2025-11-04T21:40:50Z USER 9624 (nc01/sg00) [ModuleForkPass]: unroll finished after 3.068 seconds +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2845mb, ru_maxrss: 2845mb (delta=1699mb) +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 79124 memory location(s), 1 block(s), and 144952 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:50Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=79124 blocks=1 instructions=144952 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:50Z USER 9624 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.476 seconds +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2508mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:50Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: INFO (Unroll) DONE unrolling Tue Nov 4 21:40:47 2025 + +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: sg0000 Instruction count after Unroll: +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Total count: 146112 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Matmult: 112813 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: GenericCopy: 23041 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: TensorScalarPtr: 2682 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Load: 2588 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: TensorTensor: 1554 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Activation: 817 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Select: 450 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Iota: 394 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Save: 378 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Memset: 275 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: CollectiveCompute: 260 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: DMACopy: 234 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Max: 128 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: MaxIndexAndMatchReplace: 128 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: TensorReduce: 119 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Gather: 96 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: CoreBarrier: 71 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Reciprocal: 59 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: StreamShuffle: 24 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: StreamTranspose: 1 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [Unroll]: Unrolled DGE count with Dynamic AP: 234 +2025-11-04T21:40:50Z USER 9624 (nc00/sg00) [ModuleForkPass]: unroll finished after 3.896 seconds +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2544mb, ru_maxrss: 2845mb (delta=1699mb) +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 79124 memory location(s), 1 block(s), and 146112 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:50Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o1 +2025-11-04T21:40:50Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o1: modules=1 functions=1 allocs=79124 blocks=1 instructions=146112 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [DeadCodeElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [DeadCodeElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [DeadCodeElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [DeadCodeElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:51Z USER 9624 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o1 finished after 0.437 seconds +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2544mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 4.372 seconds +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: curr_vmrss: 2544mb, ru_maxrss: 2845mb (delta=1699mb) +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.023 seconds +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2544mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73564 memory location(s), 2 block(s), and 289464 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.044 seconds +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: curr_vmrss: 2544mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:51Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47757_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47766_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47775_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47784_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47793_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47802_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47811_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47820_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47829_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47838_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47847_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47856_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47865_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47874_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47883_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47892_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47901_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47910_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47919_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47928_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47937_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47946_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47955_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47964_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47973_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47982_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47991_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t48000_i1}@SB<0,0>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_28981_45158_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:51Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_28985_45163_i1}@SB<0,0>(8x1024)#Internal DebugInfo: +2025-11-04T21:40:51Z USER 9624 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.414 seconds +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z USER 9624 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.415 seconds +2025-11-04T21:40:51Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:51Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.437 seconds +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.054 seconds +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73564 memory location(s), 2 block(s), and 289464 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.079 seconds +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:51Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:40:51Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73564 blocks=2 instructions=289464 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:51Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:51Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running instruction_reorder +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to instruction_reorder: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: instruction_reorder finished after 0.041 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: instruction_reorder finished after 0.045 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running psum_legalization +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to psum_legalization: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: psum_legalization finished after 0.032 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: psum_legalization finished after 0.036 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.158 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.171 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running legalize_cce_dma +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.025 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to legalize_cce_dma: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z WARNING 9624 (nc01/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: error_injector finished after 0.005 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 0 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: legalize_cce_dma finished after 0.026 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running error_injector +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to error_injector: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z WARNING 9624 (nc00/sg00) [ErrorInjector]: Unrecognized injected error value "0" +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: error_injector finished after 0.006 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2549mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running vn_splitter +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to vn_splitter: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Collected all the internal vnodes: size = 5 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [VNSplitter]: INFO (VNSplitter) Done with analyze and splitting: total dead nodes = 0 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ShrinkDN]: INFO (ShrinkDN): Shrunk 2 nodes. Total savings 14336 bytes/partition +2025-11-04T21:40:52Z INFO 9624 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:52Z INFO 9624 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.005 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.062 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.079 seconds +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: vn_splitter finished after 0.194 seconds +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:52Z INFO 9624 [PerformanceProfiler]: number of tensorizer non-local-tensor caused reload left 0 +2025-11-04T21:40:52Z INFO 9624 [PerformanceProfiler]: number of tensorizer non-local-tensor caused spill left 0 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [VNSplitterPass]: INFO (VNSplitter) Time: 0.006 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [VNSplitterPass]: INFO (VerticalFusion) Time: 0.07 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [VNSplitterPass]: INFO (ShrinkDN) Time: 0.074 seconds +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: vn_splitter finished after 0.198 seconds +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running constant_propagate +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to constant_propagate: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for select] directly remove instruction number: 0 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:52Z INFO 9624 (nc00/sg00) [ConstantPropagate]: [Constant_propagate for Affineselect] directly remove instruction number: 0 +2025-11-04T21:40:52Z INFO 9624 (nc01/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ConstantPropagate]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: constant_propagate finished after 0.738 seconds +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_ac finished after 0.043 seconds +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ConstantPropagate]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: constant_propagate finished after 0.770 seconds +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_ac +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_ac: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [LowerAC]: INFO (LowerAC) Lowered 0 loads, 0 saves, 0 copies. +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_ac finished after 0.044 seconds +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2550mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running input_dma_coalescing +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to input_dma_coalescing: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.127 seconds +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2551mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA input Coalescing combined 0 input loads +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: input_dma_coalescing finished after 0.132 seconds +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2551mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running remat_optimization +2025-11-04T21:40:53Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to remat_optimization: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: remat_optimization finished after 0.543 seconds +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.031 seconds +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:53Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:53Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [RematOpt]: Removed 0 remat instructions +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: remat_optimization finished after 0.534 seconds +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:54Z USER 9624 (nc01/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.023 seconds +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running coalesce_multichannel_cc_ops +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36054 memory location(s), 1 block(s), and 143363 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:54Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to coalesce_multichannel_cc_ops: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=36054 blocks=1 instructions=143363 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: coalesce_multichannel_cc_ops finished after 0.030 seconds +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running infer_stream_ids +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to infer_stream_ids: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: infer_stream_ids finished after 0.025 seconds +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2552mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37510 memory location(s), 1 block(s), and 146101 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running pre_sched +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to pre_sched: modules=1 functions=1 allocs=37510 blocks=1 instructions=146101 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: Start PRE scheduling 2 cores: 1 at: Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Start... +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: To Spill 17 multi-layer tensors +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Found 253 Splits CCs +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: Grouped CCs to 253 clusters. +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: To Spill 18 multi-layer tensors +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: set uninit flag on 0 insts +2025-11-04T21:40:54Z INFO 9624 [LayerSpiller]: LayerSpill: Done. +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: Start split live ranges Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_memsets: 0 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: Num_Splits: 1 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: End split live ranges Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: Strt remove redundncies Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_memsets +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_memsets: 4 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_loads +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_loads: 0 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: End remove redundncies Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: Start DCE Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: End DCE Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 1Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:54Z INFO 9624 (nc01/sg00) [build_flow_deps]: Allocs: 36090 instructions: 143399 +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:54Z INFO 9624 (nc00/sg00) [PreSched]: End DCE Tue Nov 4 21:40:54 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: Start build flow dependencies Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 2Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [build_flow_deps]: Allocs: 37548 instructions: 146135 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 364668 edges +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [build_flow_deps]: Done build fdeps 364668 Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 383749 edges +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [build_flow_deps]: Done build fdeps 383749 Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: End build flow dependencies Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: Start remove useless insts Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: remove_useless_insts +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: remove Useless Instructions: 0 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: End remove useless insts Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: Start scratchpad optimization Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: End scratchpad optimization Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z USER 9624 (nc01/sg00) [ModuleForkPass]: pre_sched finished after 1.579 seconds +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2607mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36090 memory location(s), 1 block(s), and 143399 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:55Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=36090 blocks=1 instructions=143399 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 1 +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [PreSched]: DONE PRE scheduling Tue Nov 4 21:40:55 2025 +2025-11-04T21:40:55Z USER 9624 (nc00/sg00) [ModuleForkPass]: pre_sched finished after 1.691 seconds +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2141mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37548 memory location(s), 1 block(s), and 146135 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:55Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=37548 blocks=1 instructions=146135 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 31 +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:55Z INFO 9624 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:40:55Z INFO 9624 (nc01/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.376 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36089 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=36089 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.003 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36090 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=36090 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.002 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.019 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.019 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [TensorCopyElim]: remove_must_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [TensorCopyElim]: remove_redundant_alias_dmacopy removed 0 DMAcopys +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [TensorCopyElim]: remove_redundant_internal2internal_dmacopy removed 0 DMAcopys +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.407 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2143mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37517 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_setup +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_setup: modules=1 functions=1 allocs=37517 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: dynamic_dma_setup finished after 0.004 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2144mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37518 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running runtime_memory_reservation +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to runtime_memory_reservation: modules=1 functions=1 allocs=37518 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: runtime_memory_reservation finished after 0.002 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2144mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.020 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2144mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.020 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2144mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_psum +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_psum: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: size = 23522 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: found 61557 edges +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: mean: 5.23399 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: median: 6.79793 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: adjacency vectors require 492456 bytes +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: allocating PSUM +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: main loop +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: renumber locations +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: size = 23694 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: lo = 23448 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: total = 23522 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap start +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.641 seconds +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2156mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: 100% PSUM demand before spilling +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: PSUM high-water mark = 8 tensors +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: found 61615 edges +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: mean: 5.20089 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: median: 6.77627 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: adjacency vectors require 492920 bytes +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: build_no_bitmap done +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: find costs +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: best-of-n loop, heuristic = 0, allow_psum_spill_within_accum_group = false +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: simplify interference graph +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: initialize low and high +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: lo = 23620 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: hi = 74 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: inf = 0 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: total = 23694 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: simplify +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: new candidates = 0 +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: select ranges +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: no more spills +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: PSUM score = 0 (lower is better) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: spilling from PSUM cost about 0 cycles +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [PSUM_Allocator]: 100% PSUM utilization after allocation +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: coloring_allocator_psum finished after 0.694 seconds +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2159mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_psum +2025-11-04T21:40:56Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_psum: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:56Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:57Z USER 9624 (nc01/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.277 seconds +2025-11-04T21:40:57Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2159mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:57Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:57Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload instructions +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [psum spill optimization]: removed 0 spill/reload memory locations +2025-11-04T21:40:57Z USER 9624 (nc00/sg00) [ModuleForkPass]: dma_optimization_psum finished after 0.292 seconds +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2168mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:57Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running address_rotation_psum +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_psum: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:57Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 9 PSUM Banks +2025-11-04T21:40:57Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 40 PSUM Banks +2025-11-04T21:40:57Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 211 PSUM Banks +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 617 PSUM Banks +2025-11-04T21:40:58Z USER 9624 (nc01/sg00) [ModuleForkPass]: address_rotation_psum finished after 1.198 seconds +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2193mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:58Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 314 PSUM Banks +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1822124622 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5701 bytes +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 1787904 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 311 bytes +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: size = 11981 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: found 22847 accumulation groups +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: largest = 38037.56769_i1 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: requires 8704 bytes/partition +2025-11-04T21:40:58Z INFO 9624 (nc01/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 640 PSUM Banks +2025-11-04T21:40:58Z USER 9624 (nc00/sg00) [ModuleForkPass]: address_rotation_psum finished after 1.577 seconds +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2196mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:58Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_sb +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_sb: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes loaded 1827224246 +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average loaded DMA size 5702 bytes +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA DRAM bytes saved 2820202 +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Pre GCA average saved DMA size 429 bytes +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:40:58Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:40:59Z INFO 9624 []: find first defs for local +2025-11-04T21:40:59Z INFO 9624 []: find first defs for global +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: allocating SB +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: main loop +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: renumber locations +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: size = 13229 +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: find partners +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: found 23019 accumulation groups +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: largest = 38037.56769_i0 +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: tensors = 2 +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: requires 8704 bytes/partition +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: expanding partners +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: 2578 remat count +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:59Z INFO 9624 []: find first defs for local +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Num intervals 11981 Num locations 11981 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:40:59Z INFO 9624 []: find first defs for global +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: edge: 339927 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: mean: 56.7443 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: median: 34.2156 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: find costs +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: safe = 10583 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: unsafe = 900 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: inf = 496 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: total = 11979 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: simplify +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 54 #Pinned 0 #Safe 0 minCost 0.0173206 maxCost 6.25704 locations 11981 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: new candidates = 16 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: select ranges +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Total: 11979 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Allocated: 1.000 (11979) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Rover zone: 0.931 (11153) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Pre-rover zone: 0.064 (764) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Post-rover zone: 0.005 (62) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Slice zone: 0.000 (0) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Blocks nothing: 0.071 (848) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Blocks medium: 0.000 (1) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.758 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.000 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.000 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Blocks tall: 0.929 (11130) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.539 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.587 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: Success +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: find loads +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: 2 pin count +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: 2587 remat count +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: 2 pinned tensors will require about 16392 bytes/partition +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: build interference graph +2025-11-04T21:40:59Z INFO 9624 (nc00/sg00) [SB_Allocator]: pass 1 int-tree +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1822124622 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5701 bytes +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 1787904 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 311 bytes +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 16416 +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 7 bytes +2025-11-04T21:40:59Z USER 9624 (nc01/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 1.685 seconds +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2163mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:40:59Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:40:59Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Num intervals 13229 Num locations 13229 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: IntervalTree Build Done +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: info.neighbors init Done +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: info.neighbors partners Done +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: IntervalTree readback Done +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: edge: 386369 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: mean: 58.4124 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: median: 35.3151 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: find costs +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: best-of-n loop, heuristic = 0 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: simplify interference graph +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: initialize safe & unsafe +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: safe = 11827 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: unsafe = 904 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: inf = 496 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: total = 13227 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: simplify +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: simplify_step3_sorted2 #Unsafe 458 #Pinned 0 #Safe 0 minCost 0.00475299 maxCost 0.509992 locations 13229 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: new candidates = 17 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: select ranges +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Total: 13227 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Spilled: 0.000 (0) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Allocated: 1.000 (13227) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Rover zone: 0.928 (12272) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Pre-rover zone: 0.059 (781) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Post-rover zone: 0.013 (170) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Slice zone: 0.000 (4) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Blocks nothing: 0.067 (880) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Blocks medium: 0.004 (47) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (mean): 0.520 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (median): 0.506 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until medium blocking (p95): 0.930 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Blocks tall: 0.930 (12300) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (mean): 0.497 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (median): 0.609 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Visited until tall blocking (p95): 1.000 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: Success +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:00Z USER 9624 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.372 seconds +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2172mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36091 memory location(s), 1 block(s), and 143398 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:00Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=36091 blocks=1 instructions=143398 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1823912526, 98.7015% input load, 0% output write, 1.29853% spill/reload [sg0000] +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: SB spills = 0 tensors +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: remats = 0 tensors +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: unpinned = 0 tensors +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: size = 0 bytes/partition +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: SB score = 0 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: spilling from SB cost about 0 cycles +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: 16392 bytes/partition (100%) successfully pinned +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: pinning saved approximately 8300 cycles +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [SB_Allocator]: 0% SB utilization after allocation +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes loaded 1827224246 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average loaded DMA size 5702 bytes +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes saved 2820202 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average saved DMA size 429 bytes +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA DRAM bytes DMACopyed 482400 +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: INFO: Post GCA average DMACopyed DMA size 85 bytes +2025-11-04T21:41:00Z USER 9624 (nc00/sg00) [ModuleForkPass]: coloring_allocator_sb finished after 2.016 seconds +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2175mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:00Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:00Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:41:00Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 0, 0% out of total dma traffic(1.80023e+09) +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:01Z USER 9624 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 0.417 seconds +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2178mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37519 memory location(s), 1 block(s), and 146104 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:01Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dma_optimization_sb +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dma_optimization_sb: modules=1 functions=1 allocs=37519 blocks=1 instructions=146104 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA optimization In bytes loaded or saved 1830044448, 98.5116% input load, 1.74859e-06% output write, 1.48835% spill/reload [sg0000] +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: removed 0 identical load +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: adjusted 0 DMACopy remat +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: sub-graph will get execute 1 times +2025-11-04T21:41:01Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Load Merging]: removed 0 remat/cloned instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:41:01Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Load shrink]: shrinked 0 GCA remat/cloned instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Load Merging + Load shrink] reduced input/const loading DMA traffic 65536, 0.00358112% out of total dma traffic(1.80281e+09) +2025-11-04T21:41:02Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:02Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:41:02Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:41:02Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Allocation optimization]: removed 0 spill/reload memory locations +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Re-allocation Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload instructions +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [spill optimization round 0]: removed 0 spill/reload memory locations +2025-11-04T21:41:02Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Spill Optimization] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:02Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 28 SpillSaves and Reloads +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 5717 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 368 bytes +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [remove extra save] removed 0 memlocs and 0 instructions +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload instructions +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [remove_memset_spill]: removed 0 spill/reload memory locations +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 12 SpillSaves and Reloads +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 5724 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 400 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 4 SpillSaves and Reloads +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average loaded DMA size 5726 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: average saved DMA size 412 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1822124622 +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5726 bytes +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 1787904 +2025-11-04T21:41:03Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 412 bytes +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 0 combined 12 SpillSaves and Reloads +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 5711 bytes +2025-11-04T21:41:03Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 456 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 0, 0% out of total dma traffic +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1823912526, 98.7015% input load, 0% output write, 1.29853% spill/reload [sg0000] +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1822124622 +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5726 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 1787904 +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 412 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 16416 +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 7 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 5613 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:41:04Z USER 9624 (nc01/sg00) [ModuleForkPass]: dma_optimization_sb finished after 3.867 seconds +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2185mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143376 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:04Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=36056 blocks=1 instructions=143376 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 1 combined 12 SpillSaves and Reloads +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 5718 bytes +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 486 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 169 Sb address +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 2 combined 4 SpillSaves and Reloads +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 5720 bytes +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 497 bytes +2025-11-04T21:41:04Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1609 Sb address +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA SpillSave Coalescing Round 3 combined 0 SpillSaves and Reloads +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average loaded DMA size 5720 bytes +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: average saved DMA size 497 bytes +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes loaded 1827158710 +2025-11-04T21:41:04Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average loaded DMA size 5720 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing DRAM bytes saved 2820202 +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA coalescing average saved DMA size 497 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization]Reload_just_for_save Optimization removed 0 memlocs +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [Experiment partial DMA access] reduced DMA traffic 0, 0% out of total spill/reload dma traffic +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: [DMA optimization] reduced DMA traffic 65536, 0.00358112% out of total dma traffic +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA optimization Out bytes loaded or saved 1829978912, 98.5116% input load, 1.74865e-06% output write, 1.48841% spill/reload [sg0000] +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes loaded 1827158710 +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average loaded DMA size 5720 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes saved 2820202 +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average saved DMA size 497 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization DRAM bytes DMAcopyed 482400 +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMAcopyed DMA size 85 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Post DMA optimization average DMA size 5533 bytes +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: INFO: Finished set_spill_canreadUninit(module); +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DMA optimization re-enable optimization +2025-11-04T21:41:05Z USER 9624 (nc00/sg00) [ModuleForkPass]: dma_optimization_sb finished after 3.992 seconds +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2187mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146089 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:05Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=37495 blocks=1 instructions=146089 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:05Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 279 Sb address +2025-11-04T21:41:05Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 192 Sb address +2025-11-04T21:41:05Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 57 Sb address +2025-11-04T21:41:06Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1556 Sb address +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 1385 Sb address +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:06Z USER 9624 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 2.212 seconds +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2189mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143376 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:06Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=36056 blocks=1 instructions=143376 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: reserved space = 166144 bytes +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: spill space = 83968 bytes +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 94208 bytes +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: size = 7 +2025-11-04T21:41:06Z INFO 9624 []: find first defs for local +2025-11-04T21:41:06Z INFO 9624 []: find first defs for global +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: Num intervals 7 Num locations 7 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: lo = 7 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: total = 7 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 94208 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:06Z USER 9624 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.393 seconds +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2201mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143376 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:06Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=36056 blocks=1 instructions=143376 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 91136 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:41:06Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 91136 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.240 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2204mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143376 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=36056 blocks=1 instructions=143376 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [TensorCopyAccel::Impl]: Accelerated 4765 out of 23029 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.065 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143376 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=36056 blocks=1 instructions=143376 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: peephole_opts finished after 0.097 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 143826, number of allocs: 36056 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [LowerKernel]: Scan BKs time (s): 0.03804 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [LowerKernel]: Lower BKs time (s): 1e-06 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_kernel finished after 0.024 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.022 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.022 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.233 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.066 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2217mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.413-t47757_i1}@SB<0,19024>(128x256)#Internal DebugInfo: <_dot.413||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.757-t47766_i1}@SB<0,29152>(128x256)#Internal DebugInfo: <_dot.757||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1101-t47775_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.1101||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1445-t47784_i1}@SB<0,22992>(128x256)#Internal DebugInfo: <_dot.1445||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.1789-t47793_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.1789||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2133-t47802_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.2133||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2477-t47811_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.2477||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.2821-t47820_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.2821||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3165-t47829_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.3165||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3509-t47838_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.3509||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.3853-t47847_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.3853||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4197-t47856_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.4197||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4541-t47865_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.4541||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.4885-t47874_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.4885||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5229-t47883_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.5229||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5573-t47892_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.5573||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.5917-t47901_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.5917||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6261-t47910_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.6261||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6605-t47919_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.6605||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.6949-t47928_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.6949||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7293-t47937_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.7293||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7637-t47946_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.7637||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.7981-t47955_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.7981||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8325-t47964_i1}@SB<0,22992>(128x256)#Internal DebugInfo: <_dot.8325||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.8669-t47973_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.8669||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9013-t47982_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.9013||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9357-t47991_i1}@SB<0,22736>(128x256)#Internal DebugInfo: <_dot.9357||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {_dot.9701-t48000_i1}@SB<0,26832>(128x256)#Internal DebugInfo: <_dot.9701||UNDEF||[128, 128, 1]> +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {divide.57_pftranspose_28981_45158_i1}@SB<32,16552>(8x1024)#Internal DebugInfo: +2025-11-04T21:41:07Z WARNING 9624 [birverifier::InstVisitor]: (nc01/sg00) Non - output memory location with no reader: {select.30_pftranspose_28985_45163_i1}@SB<96,17672>(8x1024)#Internal DebugInfo: +2025-11-04T21:41:07Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 285 Sb address +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.364 seconds +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2219mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:07Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:41:07Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.025 seconds +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2219mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 3Tue Nov 4 21:41:08 2025 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [build_flow_deps]: Allocs: 36056 instructions: 143826 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 365554 edges +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [build_flow_deps]: Done build fdeps 365554 Tue Nov 4 21:41:08 2025 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: build_fdeps finished after 0.417 seconds +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:41:08Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 87 Sb address +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: remove_redundancies finished after 0.078 seconds +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2249mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:08Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:08Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 1213 Sb address +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 0 Sb address +2025-11-04T21:41:09Z USER 9624 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 4.057 seconds +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2461mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146089 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:09Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram: modules=1 functions=1 allocs=37495 blocks=1 instructions=146089 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:09Z USER 9624 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.919 seconds +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2467mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:09Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Local +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: reserved space = 166152 bytes +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: spill space = 116800 bytes +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 147456 bytes +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: size = 17 +2025-11-04T21:41:09Z INFO 9624 []: find first defs for local +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:41:09Z INFO 9624 []: find first defs for global +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:09Z USER 9624 (nc01/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.341 seconds +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2468mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Num intervals 17 Num locations 17 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: lo = 17 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: total = 17 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:09Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 0 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 0 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 110592 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=36056 blocks=1 instructions=143826 Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:09Z USER 9624 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram finished after 0.438 seconds +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2468mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146089 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:09Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running address_rotation_dram +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_dram: modules=1 functions=1 allocs=37495 blocks=1 instructions=146089 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: Runtime page size at 512MB +2025-11-04T21:41:09Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm before rotation 110592 +2025-11-04T21:41:09Z USER 9624 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.150 seconds +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:09Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36056 memory location(s), 1 block(s), and 143826 instruction(s). Max writers: 298 Max Readers: 35741 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: allreduce buffer size 524288000 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: allreduce hwm 32768 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: Real CC buffer size 32768 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DRAM hwm after rotation 110592 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: DRAM Rotation rotated 0 Dram address +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: address_rotation_dram finished after 0.248 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146089 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running tensorcopy_accel +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to tensorcopy_accel: modules=1 functions=1 allocs=37495 blocks=1 instructions=146089 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [TensorCopyAccel::Impl]: Running peephole optimization pass +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [TensorCopyAccel::Impl]: Accelerated 4877 out of 23274 tensorcopy in Function: sg0000 average acceleration factor: 1 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: tensorcopy_accel finished after 0.052 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146089 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running peephole_opts +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to peephole_opts: modules=1 functions=1 allocs=37495 blocks=1 instructions=146089 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [PeepholeOpts]: PeepholeOpts enabled? Recip: true Tsp: true Tc: false SplitSelect: true SimplifyMemset true +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: peephole_opts finished after 0.089 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_kernel +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_kernel: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [LowerKernel]: Started running LowerKernel +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [LowerKernel]: BIR SB coloring allocator is disabled +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [LowerKernel]: Start of kernel lowering pass, number of insts: 146539, number of allocs: 37495 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [LowerKernel]: Scan BKs time (s): 0.018503 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [LowerKernel]: Lower BKs time (s): 2e-06 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_kernel finished after 0.024 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_klir_kernel +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_klir_kernel: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_klir_kernel finished after 0.021 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_nki_kernel +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_nki_kernel: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_nki_kernel finished after 0.020 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2469mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running non_ssa_legalization +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to non_ssa_legalization: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [NonSSALeg]: remove_redundant_loads +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [NonSSALeg]: remove_redundant_loads: 0 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [NonSSALeg]: [Non-SSA legalization]created 0 memorylocations +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: non_ssa_legalization finished after 0.129 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2470mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_cleanup +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_cleanup: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: dynamic_dma_cleanup finished after 0.026 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2470mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.262 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2471mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dynamic_dma_scan +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dynamic_dma_scan: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: dynamic_dma_scan finished after 0.026 seconds +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2471mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running build_fdeps +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to build_fdeps: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 4Tue Nov 4 21:41:10 2025 +2025-11-04T21:41:10Z INFO 9624 (nc00/sg00) [build_flow_deps]: Allocs: 37495 instructions: 146539 +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 384609 edges +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [build_flow_deps]: Done build fdeps 384609 Tue Nov 4 21:41:11 2025 +2025-11-04T21:41:11Z USER 9624 (nc00/sg00) [ModuleForkPass]: build_fdeps finished after 0.390 seconds +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2502mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:11Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running remove_redundancies +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to remove_redundancies: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [RemoveRedundancies]: remove_clobbered_writes: 0 +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [RemoveRedundancies]: remove_useless_insts +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [RemoveRedundancies]: remove Useless Instructions: 0 +2025-11-04T21:41:11Z USER 9624 (nc00/sg00) [ModuleForkPass]: remove_redundancies finished after 0.081 seconds +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2502mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:11Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:11Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:12Z USER 9624 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.895 seconds +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2601mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running tensor_copy_elim +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to tensor_copy_elim: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [TensorCopyElim]: Tensor CP elimination: 0 +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [TensorCopyElim]: eliminateDeadStore removed 0 instructions +2025-11-04T21:41:12Z USER 9624 (nc00/sg00) [ModuleForkPass]: tensor_copy_elim finished after 0.279 seconds +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=37495 blocks=1 instructions=146539 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.126 seconds +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37495 memory location(s), 1 block(s), and 146539 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:12Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 20.581 seconds +2025-11-04T21:41:12Z INFO 9624 [BackendPassManager]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:12Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73551 blocks=2 instructions=290365 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=73551 blocks=2 instructions=290365 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.016 seconds +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73551 memory location(s), 2 block(s), and 290365 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (sg00) [SubgraphForkPass]: Running lower_local_collectives +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to lower_local_collectives: modules=2 functions=2 allocs=73551 blocks=2 instructions=290365 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (sg00) [SubgraphForkPass]: lower_local_collectives finished after 0.104 seconds +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291279 instruction(s). Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:12Z USER 9624 (sg00) [SubgraphForkPass]: Running extend_shared_lifetimes +2025-11-04T21:41:12Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to extend_shared_lifetimes: modules=2 functions=2 allocs=73949 blocks=2 instructions=291279 Max writers: 298 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (sg00) [SubgraphForkPass]: extend_shared_lifetimes finished after 0.341 seconds +2025-11-04T21:41:13Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291283 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.498 seconds +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: curr_vmrss: 2332mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:41:13Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running coloring_allocator_dram_shared +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to coloring_allocator_dram_shared: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: allocating spills in DRAM pre_link mode for address space Shared +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: reserved space = 282952 bytes +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: reserved space = 250112 bytes +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: spill space = 6307842 bytes +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: aligned spill space = 6316032 bytes +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: dram space = 107374182400 bytes +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: renumber locations +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: size = 132 +2025-11-04T21:41:13Z INFO 9624 []: find first defs for local +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [DRAM_Allocator]: Skipping shared tensor allocations on core 1, marking as remoteLocalTarget instead +2025-11-04T21:41:13Z USER 9624 (nc01/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.176 seconds +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2347mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:13Z INFO 9624 []: find first defs for global +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Num intervals 132 Num locations 132 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: IntervalTree Build Done +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: info.neighbors init Done +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: IntervalTree readback Done +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: simplify interference graph +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: initialize low and high +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: lo = 132 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: hi = 0 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: total = 132 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: simplify +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: new candidates = 0 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 110592 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Fall back to default allocation strategy [Core0 Local, Shared] +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Already used DRAM hwm: 110592 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: select ranges +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: CC buffer size limit 524288000 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: allreduce_dram_hwm 3878912 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: Real CC buffer size 3878912 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: DRAM hwm after allocation: 6332416 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [DRAM_Allocator]: DRAM allocation successful +2025-11-04T21:41:13Z USER 9624 (nc00/sg00) [ModuleForkPass]: coloring_allocator_dram_shared finished after 0.429 seconds +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.446 seconds +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (sg00) [SubgraphForkPass]: Running sync_shared_allocations +2025-11-04T21:41:13Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to sync_shared_allocations: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (sg00) [SubgraphForkPass]: sync_shared_allocations finished after 0.013 seconds +2025-11-04T21:41:13Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291283 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.029 seconds +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:13Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer_post_shared_dram +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer_post_shared_dram: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM} +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:13Z USER 9624 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.109 seconds +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z USER 9624 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer_post_shared_dram finished after 0.113 seconds +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:13Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.125 seconds +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: curr_vmrss: 2369mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:13Z USER 9624 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:13Z INFO 9624 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z USER 9624 (nc00) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:13Z USER 9624 (nc01) [CoreForkPass]: Running memory_analysis_after_coloring_allocator_dram_shared +2025-11-04T21:41:13Z INFO 9624 (nc00) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:13Z INFO 9624 (nc01) [CoreForkPass]: Inputs to memory_analysis_after_coloring_allocator_dram_shared: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:14Z USER 9624 (nc01) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.545 seconds +2025-11-04T21:41:14Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 2398mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:14Z USER 9624 (nc00) [CoreForkPass]: memory_analysis_after_coloring_allocator_dram_shared finished after 0.561 seconds +2025-11-04T21:41:14Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 2398mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:14Z USER 9624 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:14Z USER 9624 [BackendPassManager]: nc_parallel_pass finished after 0.585 seconds +2025-11-04T21:41:14Z INFO 9624 [BackendPassManager]: curr_vmrss: 2398mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:14Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:14Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291283 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:14Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:14Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_before_sched +2025-11-04T21:41:14Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:14Z USER 9624 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.004 seconds +2025-11-04T21:41:14Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2398mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:14Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:14Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_before_sched: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:14Z USER 9624 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_before_sched finished after 0.009 seconds +2025-11-04T21:41:14Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2398mb, ru_maxrss: 2845mb (delta=0mb) +2025-11-04T21:41:14Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:14Z INFO 9624 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:14Z INFO 9624 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:14Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:14Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running post_sched +2025-11-04T21:41:14Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to post_sched: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:14Z INFO 9624 [PostSched]: Detected modules.size() == 1; running LNC=1 post_sched +2025-11-04T21:41:14Z INFO 9624 [PostSched]: Detected --lnc_aware_scheduler=false; running LNC=1 post_sched +2025-11-04T21:41:14Z INFO 9624 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:14 2025 +2025-11-04T21:41:14Z INFO 9624 [post_scheduler]: Start PosT ScheD 3 gen3 Tue Nov 4 21:41:14 2025 +2025-11-04T21:41:15Z INFO 9624 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:15Z INFO 9624 [post_scheduler]: Time-aware hwm post-sched +2025-11-04T21:41:19Z INFO 9624 [post_scheduler]: Time-aware simulation time: 11228166 +2025-11-04T21:41:19Z INFO 9624 [post_scheduler]: Time-aware simulation time: 11709617 +2025-11-04T21:41:20Z INFO 9624 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:20 2025 +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: post_sched finished after 5.758 seconds +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 3103mb, ru_maxrss: 3103mb (delta=258mb) +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.025 seconds +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2920mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144285 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=36255 blocks=1 instructions=144285 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z INFO 9624 [post_scheduler]: Done PosT ScheD Tue Nov 4 21:41:20 2025 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: post_sched finished after 5.904 seconds +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2923mb, ru_maxrss: 3103mb (delta=258mb) +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running expand_scheduling_units +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to expand_scheduling_units: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: expand_scheduling_units finished after 0.025 seconds +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2876mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dead_code_elim_o0 +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dead_code_elim_o0: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.144 seconds +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2878mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: dead_code_elim_o0 finished after 0.141 seconds +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2878mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:20Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 6.104 seconds +2025-11-04T21:41:20Z INFO 9624 [BackendPassManager]: curr_vmrss: 2876mb, ru_maxrss: 3103mb (delta=258mb) +2025-11-04T21:41:20Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:20Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291251 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (sg00) [SubgraphForkPass]: Running localize_shared_memory +2025-11-04T21:41:20Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to localize_shared_memory: modules=2 functions=2 allocs=73949 blocks=2 instructions=291251 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (sg00) [SubgraphForkPass]: localize_shared_memory finished after 0.014 seconds +2025-11-04T21:41:20Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2876mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291251 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:20Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.025 seconds +2025-11-04T21:41:20Z INFO 9624 [BackendPassManager]: curr_vmrss: 2876mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:20Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:20Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291251 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:20Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:20Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running address_rotation_sb +2025-11-04T21:41:20Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:20Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to address_rotation_sb: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:21Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 14203 PSUM Banks +2025-11-04T21:41:21Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 14339 PSUM Banks +2025-11-04T21:41:22Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 298 PSUM Banks +2025-11-04T21:41:22Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 312 PSUM Banks +2025-11-04T21:41:22Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 10341 PSUM Banks +2025-11-04T21:41:22Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: PSUM Rotation rotated 9853 PSUM Banks +2025-11-04T21:41:22Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 11 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 31 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 36 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 71 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 174 Sb address +2025-11-04T21:41:23Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 237 Sb address +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 3478 Sb address +2025-11-04T21:41:24Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 196 Sb address +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: moved 30 MM forward +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 18 Sb address +2025-11-04T21:41:24Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 3632 Sb address +2025-11-04T21:41:24Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: moved 3 MM forward +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [DMAOptimizationBase]: SB Rotation rotated 3 Sb address +2025-11-04T21:41:24Z USER 9624 (nc01/sg00) [ModuleForkPass]: address_rotation_sb finished after 4.425 seconds +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2899mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:24Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:24Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:24Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 2 Sb address +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [DMAOptimizationBase]: SB Rotation rotated 12 Sb address +2025-11-04T21:41:25Z USER 9624 (nc00/sg00) [ModuleForkPass]: address_rotation_sb finished after 4.982 seconds +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2997mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:25Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS,PSUM,SB} +2025-11-04T21:41:25Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:25Z USER 9624 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.915 seconds +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 3050mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:25Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:25Z USER 9624 (nc01/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.195 seconds +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 3003mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:25Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:25Z INFO 9624 (nc01/sg00) [build_flow_deps]: Start build fdeps. Invocation: 5Tue Nov 4 21:41:25 2025 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [build_flow_deps]: Allocs: 36255 instructions: 144253 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [build_flow_deps]: Build fdeps inserted 366358 edges +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [build_flow_deps]: Done build fdeps 366358 Tue Nov 4 21:41:26 2025 +2025-11-04T21:41:26Z USER 9624 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.998 seconds +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 3035mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:26Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running anti_dependency_analyzer +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to anti_dependency_analyzer: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: Analysis types: {DRAM,ALIAS} +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [AntiDependencyAnalyzer]: DRAM size: 25769803776 num-bins: 24 bin-size: 1073741824 +2025-11-04T21:41:26Z USER 9624 (nc00/sg00) [ModuleForkPass]: anti_dependency_analyzer finished after 0.134 seconds +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2837mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:26Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running dep_opt +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to dep_opt: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [build_flow_deps]: Start build fdeps. Invocation: 6Tue Nov 4 21:41:26 2025 +2025-11-04T21:41:26Z USER 9624 (nc01/sg00) [ModuleForkPass]: dep_opt finished after 0.617 seconds +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2837mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [build_flow_deps]: Allocs: 37694 instructions: 146998 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:26Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬───────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼───────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 177 │ 131072 │ +│ Load │ Const -> Internal │ 5 │ 165120 │ +│ Load │ ExternalInput -> Internal │ 2485 │ 1800063040 │ +│ Load │ Internal │ 72 │ 1948678 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 340 │ 1787904 │ +└──────────────┴───────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 1 │ +│ 4 │ 66 │ +│ 8 │ 7 │ +│ 16 │ 6 │ +│ 32 │ 62 │ +│ 64 │ 5 │ +│ 88 │ 3 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 304 │ +│ 1024 │ 2 │ +│ 2048 │ 85 │ +│ 4096 │ 1193 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 953 │ +│ 16400 │ 8 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: MM Stats: #MatMults 111965 #MatMult-Transposes 35796 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: IO Tensor size combined: 5790091456 +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌─────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├─────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60809_i1 │ Internal │ bfloat16 │ 3153920 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ all_gather.1_nostride_60911_i10 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i12 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i9 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i11 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i14 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i13 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i8 │ Internal │ bfloat16 │ 2099200 │ +└─────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:26Z USER 9624 (nc01/sg00) [ModuleForkPass]: report_stats finished after 0.052 seconds +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2832mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:26Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144253 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [build_flow_deps]: Build fdeps inserted 385924 edges +2025-11-04T21:41:26Z INFO 9624 (nc00/sg00) [build_flow_deps]: Done build fdeps 385924 Tue Nov 4 21:41:26 2025 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: dep_opt finished after 0.532 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2834mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running report_stats +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to report_stats: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: Data Movement Statistics: sg0000 +┌──────────────┬────────────────────────────┬───────┬────────────┐ +│ Instruction │ Kind │ Count │ Bytes │ +├──────────────┼────────────────────────────┼───────┼────────────┤ +│ DMACopy │ ExternalInput -> Internal │ 1 │ 311164928 │ +│ DMACopy │ Internal │ 208 │ 139264 │ +│ DMACopy │ Internal -> ExternalOutput │ 224 │ 7516192768 │ +│ Load │ Const -> Internal │ 10 │ 2678024 │ +│ Load │ ExternalInput -> Internal │ 2486 │ 1800063072 │ +│ Load │ Internal │ 88 │ 4469830 │ +│ Load (Spill) │ ExternalInput -> Internal │ 3 │ 264 │ +│ Load (Spill) │ Internal │ 12 │ 19947520 │ +│ Save │ Internal │ 389 │ 2820170 │ +│ Save │ Internal -> ExternalOutput │ 1 │ 32 │ +└──────────────┴────────────────────────────┴───────┴────────────┘ + +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: +┌─────────────────────┬───────┐ +│ Bytes per partition │ Count │ +├─────────────────────┼───────┤ +│ 2 │ 2 │ +│ 4 │ 74 │ +│ 8 │ 16 │ +│ 16 │ 6 │ +│ 32 │ 65 │ +│ 64 │ 7 │ +│ 88 │ 3 │ +│ 128 │ 2 │ +│ 256 │ 2 │ +│ 384 │ 1 │ +│ 512 │ 305 │ +│ 1024 │ 17 │ +│ 2048 │ 85 │ +│ 4096 │ 1221 │ +│ 6144 │ 224 │ +│ 6160 │ 4 │ +│ 8192 │ 953 │ +│ 16384 │ 2 │ +│ 16400 │ 8 │ +│ 18992 │ 2 │ +│ 1048576 │ 224 │ +└─────────────────────┴───────┘ + +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: MM Stats: #MatMults 112809 #MatMult-Transposes 36580 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: IO Tensor size combined: 5790091456 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: IO Tensor Statistics: +┌────────────────────┬───────────────┬──────────┬──────────────┐ +│ Largest IO Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────┼───────────────┼──────────┼──────────────┤ +│ input369 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input60 │ ExternalInput │ bfloat16 │ 311164928 │ +│ input8 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input11 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input4 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input7 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input10 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input5 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input13 │ ExternalInput │ bfloat16 │ 33554432 │ +│ input12 │ ExternalInput │ bfloat16 │ 33554432 │ +└────────────────────┴───────────────┴──────────┴──────────────┘ + +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ReportStats]: Large (Internal) Tensor Statistics: +┌────────────────────────────────┬──────────┬──────────┬──────────────┐ +│ Largest Tensors │ Kind │ Src Type │ Size (Bytes) │ +├────────────────────────────────┼──────────┼──────────┼──────────────┤ +│ all_gather.1_nostride_60809_i0 │ Internal │ bfloat16 │ 3153920 │ +│ -t80743 │ Internal │ float32 │ 2562048 │ +│ -t80737 │ Internal │ float32 │ 2562048 │ +│ convert.656 │ Internal │ float32 │ 2430976 │ +│ -t80740 │ Internal │ float32 │ 2430976 │ +│ constant.13 │ Const │ float32 │ 2430976 │ +│ all_gather.1_nostride_60911_i3 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i2 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i1 │ Internal │ bfloat16 │ 2099200 │ +│ all_gather.1_nostride_60911_i0 │ Internal │ bfloat16 │ 2099200 │ +└────────────────────────────────┴──────────┴──────────┴──────────────┘ + +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: report_stats finished after 0.048 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2822mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 146998 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 6.744 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2822mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running assign_trigger_engine +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to assign_trigger_engine: modules=2 functions=2 allocs=73949 blocks=2 instructions=291251 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [AssignTriggerEngine]: Assigned trigger engine for 460 DMA instructions. Moved 71 DMA instructions to CC's engines. +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [AssignTriggerEngine]: Assigned trigger engine for 410 DMA instructions. Moved 70 DMA instructions to CC's engines. +2025-11-04T21:41:27Z INFO 9624 [AssignTriggerEngine]: Limiting IO queue to SP only +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: assign_trigger_engine finished after 0.159 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2826mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291251 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291251 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running sync_before_global_cc +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=37694 blocks=1 instructions=146998 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to sync_before_global_cc: modules=1 functions=1 allocs=36255 blocks=1 instructions=144253 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.044 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: sync_before_global_cc finished after 0.045 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.057 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running assign_hwdge_engine +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to assign_hwdge_engine: modules=2 functions=2 allocs=73949 blocks=2 instructions=291369 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: assign_hwdge_engine finished after 0.046 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 291369 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291369 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running alloc_queues +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to alloc_queues: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 6 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 8 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 226 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 250 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 77 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 28 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2491 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 9 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: alloc_queues finished after 0.024 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [AllocQueues]: Alloc Queue info: +┌───────────────────┬────────────────┬────────────┬────────────┬──────────────────┐ +│ Name │ DMAQueue::Type │ Engine │ Num Queues │ Num instructions │ +├───────────────────┼────────────────┼────────────┼────────────┼──────────────────┤ +│ qSPIO0 │ input │ SP │ 16 │ 8 │ +│ qSPSpillReload0 │ data │ SP │ 16 │ 19 │ +│ qPoolSpillReload0 │ data │ Pool │ 16 │ 260 │ +│ qActSpillReload0 │ data │ Activation │ 16 │ 248 │ +│ qDVESpillReload0 │ data │ DVE │ 16 │ 117 │ +│ qSPDynamicHW │ dynamic │ SP │ 16 │ 39 │ +│ qPoolDynamic │ dynamic │ Pool │ 16 │ 2716 │ +│ qActDynamicHW │ dynamic │ Activation │ 16 │ 15 │ +└───────────────────┴────────────────┴────────────┴────────────┴──────────────────┘ + +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: alloc_queues finished after 0.025 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running chain_dma_transposes +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to chain_dma_transposes: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.042 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: chain_dma_transposes finished after 0.043 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.082 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291369 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:27Z USER 9624 (nc01) [CoreForkPass]: Running insert_dma_switch_queue_instance +2025-11-04T21:41:27Z INFO 9624 (nc00) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.002 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc01) [CoreForkPass]: Inputs to insert_dma_switch_queue_instance: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc01) [CoreForkPass]: insert_dma_switch_queue_instance finished after 0.003 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: nc_parallel_pass finished after 0.012 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291369 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running prefetch_scheduling_after_sched +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.002 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to prefetch_scheduling_after_sched: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: prefetch_scheduling_after_sched finished after 0.003 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running lower_control +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to lower_control: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [LowerControl]: EraseInterBbDeps removed 0 inter-BB deps +2025-11-04T21:41:27Z USER 9624 (nc01/sg00) [ModuleForkPass]: lower_control finished after 0.210 seconds +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z USER 9624 (nc00/sg00) [ModuleForkPass]: lower_control finished after 0.218 seconds +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.234 seconds +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: curr_vmrss: 2823mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:27Z USER 9624 [BackendPassManager]: Running nc_parallel_pass +2025-11-04T21:41:27Z INFO 9624 [BackendPassManager]: Inputs to nc_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=291369 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z USER 9624 (nc00) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:27Z USER 9624 (nc01) [CoreForkPass]: Running dep_reduction +2025-11-04T21:41:27Z INFO 9624 (nc00) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:27Z INFO 9624 (nc01) [CoreForkPass]: Inputs to dep_reduction: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [DepReduction]: Start Dependency Reduction +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [DepReduction]: Cacheing dependencies for debug info +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:27Z INFO 9624 (nc01/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [DepReduction]: Processing async instrs... +2025-11-04T21:41:27Z INFO 9624 (nc00/sg00) [DepReduction]: Processing secondary edges per engine... +2025-11-04T21:41:28Z INFO 9624 (nc01/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 117428 +2025-11-04T21:41:28Z INFO 9624 (nc00/sg00) [DepReduction]: Processing secondary edges per engine, Done. Num edges removed 128806 +2025-11-04T21:41:28Z INFO 9624 (nc01/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 121670 +2025-11-04T21:41:28Z INFO 9624 (nc01/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 121670 +2025-11-04T21:41:28Z INFO 9624 (nc00/sg00) [DepReduction]: Processing redundant descendants, Done. Num edges removed 133821 +2025-11-04T21:41:28Z INFO 9624 (nc00/sg00) [DepReduction]: Processing async instrs, Done. Num edges removed 133821 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [DepReduction]: Finished dependency reduction: 845823 removed, new total 48867 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: dep_reduction finished after 1.741 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3031mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: lower_dynamic_dma finished after 0.029 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3013mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: legalize_dynamic_dma finished after 0.090 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: optimize_queue_switch finished after 0.023 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3022mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144312 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=36255 blocks=1 instructions=144312 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [DepReduction]: Num Async removed: 0 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [DepReduction]: Finished dependency reduction: 886878 removed, new total 51194 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [DepReduction]: Finished Dependency Reduction +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: dep_reduction finished after 1.905 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3022mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running lower_dynamic_dma +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_dynamic_dma: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: lower_dynamic_dma finished after 0.030 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3022mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running legalize_dynamic_dma +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to legalize_dynamic_dma: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z INFO 9624 (nc01/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2368/2368 (100% DGE) + power-of-2 partition : 2368/2373 (99.7893% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2369/2374 (99.7894% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 37/37 (100% DGE) + power-of-2 partition : 37/429 (8.62471% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 37/429 (8.62471% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/0 + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/0 + CopyMode + CCE : 169 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 9/9 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: lower_dma finished after 0.106 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3022mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144313 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=36255 blocks=1 instructions=144313 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [LegalizeDynamicDMA]: Legalize Dynamic DMA scanned 0 DGE instructions +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [LegalizeDynamicDMA]: After Legalize Dynamic DMA, 0 DGE instructions were scanned +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [LegalizeDynamicDMA]: +┌───────────┬───────────────────────────────┬────────────────────────────┐ +│ Sub-Pass │ Illegal Instructions Detected │ New Instructions Generated │ +├───────────┼───────────────────────────────┼────────────────────────────┤ +│ Peeling │ 0 │ 0 │ +│ Unrolling │ 0 │ 0 │ +│ Splitting │ 0 │ 0 │ +└───────────┴───────────────────────────────┴────────────────────────────┘ + +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: legalize_dynamic_dma finished after 0.090 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running optimize_queue_switch +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: expand_all_engine finished after 0.029 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to optimize_queue_switch: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144313 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=36255 blocks=1 instructions=144313 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [OptimizeQueueSwitch]: Optimize queue switch has replaced 0 total SQI Instructions with RQI +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: optimize_queue_switch finished after 0.025 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147057 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running lower_dma +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_dma: modules=1 functions=1 allocs=37694 blocks=1 instructions=147057 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z INFO 9624 (nc00/sg00) [LowerDMA]: lower_dma metrics start + IO + Copy (DGE/DMA) + 128 partition : 2368/2368 (100% DGE) + power-of-2 partition : 2368/2375 (99.7053% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 2369/2376 (99.7054% DGE) + Cast (DGE/DMA) + 128 partition : 57/57 (100% DGE) + power-of-2 partition : 113/114 (99.1228% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 113/114 (99.1228% DGE) + Spill/Reload + Copy (DGE/DMA) + 128 partition : 54/54 (100% DGE) + power-of-2 partition : 54/499 (10.8216% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 54/499 (10.8216% DGE) + Cast (DGE/DMA) + 128 partition : 0/0 + power-of-2 partition : 0/2 (0% DGE) + > 3 dimensional : 0/0 + non-integer desc size : 0/0 + total : 0/2 (0% DGE) + CopyMode + CCE : 197 + Transpose : 0 + Replicate : 0 + Dynamic (DGE/DMA) + scalar : 0/0 + vector : 234/234 (100% DGE) + Opcode + ReadVarAddr : 0 + IndirectLoad : 0 + IndirectSave : 0 + IndirectSaveAccumulate : 0 + DstReduceDGE : 0 +lower_dma metrics end +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: lower_dma finished after 0.109 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147059 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running expand_all_engine +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to expand_all_engine: modules=1 functions=1 allocs=37694 blocks=1 instructions=147059 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: expand_all_engine finished after 0.029 seconds +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147059 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc00) [CoreForkPass]: Running alloc_semaphores +2025-11-04T21:41:29Z INFO 9624 (nc00) [CoreForkPass]: Inputs to alloc_semaphores: modules=1 functions=1 allocs=37694 blocks=1 instructions=147059 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: alloc_semaphores finished after 0.201 seconds +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144313 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:29Z USER 9624 (nc01) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:29Z INFO 9624 (nc01) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=36255 blocks=1 instructions=144313 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: alloc_semaphores finished after 0.203 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147059 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running expand_inst_late +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Inputs to expand_inst_late: modules=1 functions=1 allocs=37694 blocks=1 instructions=147059 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: expand_inst_late finished after 0.228 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144322 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=36255 blocks=1 instructions=144322 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [SeqInstOpt]: Removing 7 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: seq_inst_opt finished after 0.023 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 144315 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=36255 blocks=1 instructions=144315 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: lower_sync finished after 0.059 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149283 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running lower_act +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=36255 blocks=1 instructions=149283 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: lower_act finished after 0.027 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3021mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=36255 blocks=1 instructions=149424 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: expand_inst_late finished after 0.235 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3024mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147293 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running seq_inst_opt +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Inputs to seq_inst_opt: modules=1 functions=1 allocs=37694 blocks=1 instructions=147293 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z INFO 9624 (nc00/sg00) [SeqInstOpt]: Removing 230 unnecessary InstRegisterMove instruction(s) from Block1 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: seq_inst_opt finished after 0.025 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3023mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 147063 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running lower_sync +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_sync: modules=1 functions=1 allocs=37694 blocks=1 instructions=147063 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: lower_sync finished after 0.062 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3037mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152601 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running lower_act +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_act: modules=1 functions=1 allocs=37694 blocks=1 instructions=152601 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: lower_act finished after 0.028 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3045mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running lower_dve +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_dve: modules=1 functions=1 allocs=37694 blocks=1 instructions=152745 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z INFO 9624 (nc00/sg00) [LowerDVE]: Loading DVE opcodes table dve_info.json from /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/dve/dve_bin_gen3/dve_info.json +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: lower_dve finished after 0.444 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3095mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=36255 blocks=1 instructions=149424 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: lower_ap finished after 0.033 seconds +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3095mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z USER 9624 (nc01) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:30Z INFO 9624 (nc01) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=36255 blocks=1 instructions=149424 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:30Z INFO 9624 (nc01/sg00) [REG_Allocator]: size = 2 +2025-11-04T21:41:30Z INFO 9624 []: find first defs for local reg +2025-11-04T21:41:30Z INFO 9624 []: find first defs for global reg +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: lower_dve finished after 0.465 seconds +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3090mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:30Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:30Z USER 9624 (nc00) [CoreForkPass]: Running lower_ap +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: Inputs to lower_ap: modules=1 functions=1 allocs=37694 blocks=1 instructions=152745 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 (nc00) [CoreForkPass]: lower_ap finished after 0.033 seconds +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3035mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 (nc00) [CoreForkPass]: Running coloring_allocator_reg +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: Inputs to coloring_allocator_reg: modules=1 functions=1 allocs=37694 blocks=1 instructions=152745 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: Allocating functions +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [ColoringAllocator::Rep]: linearize and check +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: allocating REG +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: main loop iteration 1 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: renumber registers +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: size = 4 +2025-11-04T21:41:31Z INFO 9624 []: find first defs for local reg +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:31Z INFO 9624 []: find first defs for global reg +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: lo = 2 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: total = 2 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:31Z USER 9624 (nc01) [CoreForkPass]: coloring_allocator_reg finished after 0.433 seconds +2025-11-04T21:41:31Z INFO 9624 (nc01) [CoreForkPass]: curr_vmrss: 3068mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 (nc01) [CoreForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: live range analysis +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: find costs +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: simplify interference graph +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: initialize low and high +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: lo = 4 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: hi = 0 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: inf = 0 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: total = 4 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: simplify +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: new candidates = 0 +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: select ranges +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: no more spills +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: REG score = 0 (lower is better) +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: Spilling from REG cost about 0 cycles +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [REG_Allocator]: 0% REG utilization after allocation +2025-11-04T21:41:31Z USER 9624 (nc00) [CoreForkPass]: coloring_allocator_reg finished after 0.446 seconds +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: curr_vmrss: 3069mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 (nc00) [CoreForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 [CoreForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: nc_parallel_pass finished after 3.853 seconds +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: curr_vmrss: 2985mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: Running vnc_remote_addr_map +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Inputs to vnc_remote_addr_map: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: vnc_remote_addr_map finished after 0.017 seconds +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: curr_vmrss: 2923mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 302169 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: Running vnc_link +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Inputs to vnc_link: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z INFO 9624 [VncLink]: Found 0 remote updates +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: vnc_link finished after 0.006 seconds +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: curr_vmrss: 2923mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 302169 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:31Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running birverifier +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=37694 blocks=1 instructions=152745 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to birverifier: modules=1 functions=1 allocs=36255 blocks=1 instructions=149424 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:31Z USER 9624 (nc01/sg00) [ModuleForkPass]: birverifier finished after 0.311 seconds +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 2939mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:31Z USER 9624 (nc00/sg00) [ModuleForkPass]: birverifier finished after 0.320 seconds +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 2921mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 0.332 seconds +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: curr_vmrss: 2919mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:31Z USER 9624 [BackendPassManager]: Running subgraph_parallel_pass +2025-11-04T21:41:31Z INFO 9624 [BackendPassManager]: Inputs to subgraph_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:31Z USER 9624 (sg00) [SubgraphForkPass]: Running lnc_verifier +2025-11-04T21:41:31Z INFO 9624 (sg00) [SubgraphForkPass]: Inputs to lnc_verifier: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:32Z USER 9624 (sg00) [SubgraphForkPass]: lnc_verifier finished after 0.060 seconds +2025-11-04T21:41:32Z INFO 9624 (sg00) [SubgraphForkPass]: curr_vmrss: 2922mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:32Z INFO 9624 (sg00) [SubgraphForkPass]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 302169 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:32Z USER 9624 [SubgraphForkPass]: Compilation status: Total subgraphs: 1, Passed: 1, Failed: 0 +2025-11-04T21:41:32Z USER 9624 [BackendPassManager]: subgraph_parallel_pass finished after 0.074 seconds +2025-11-04T21:41:32Z INFO 9624 [BackendPassManager]: curr_vmrss: 2919mb, ru_maxrss: 3103mb (delta=0mb) +2025-11-04T21:41:32Z USER 9624 [BackendPassManager]: Running mod_parallel_pass +2025-11-04T21:41:32Z INFO 9624 [BackendPassManager]: Inputs to mod_parallel_pass: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:32Z USER 9624 (nc00/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:32Z USER 9624 (nc01/sg00) [ModuleForkPass]: Running codegen +2025-11-04T21:41:32Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=36255 blocks=1 instructions=149424 Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:32Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Inputs to codegen: modules=1 functions=1 allocs=37694 blocks=1 instructions=152745 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:32Z INFO 9624 (nc01/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:32Z INFO 9624 (nc01/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64244 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249505 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:32Z INFO 9624 (nc00/sg00) [Codegen]: Total un-allocated DRAM tensors by kind: +2025-11-04T21:41:32Z INFO 9624 (nc00/sg00) [Codegen]: +┌────────────────┬─────────────┐ +│ TensorKind │ Size (GB) │ +├────────────────┼─────────────┤ +│ ExternalInput │ 3.64244 │ +│ ExternalOutput │ 2.98023e-08 │ +│ Const │ 0.00249506 │ +└────────────────┴─────────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 113494 │ +│ LDWEIGHTS │ 113494 │ +│ CAST │ 17902 │ +│ COPY │ 5320 │ +│ EVENT_SEMAPHORE │ 4968 │ +│ ACTIVATE │ 2540 │ +│ UNKNOWN(0xd4) │ 2528 │ +│ TENSOR_TENSOR │ 1553 │ +│ UNKNOWN(0xd8) │ 589 │ +│ PSEUDO_DMA_TRIGGER │ 567 │ +│ UNKNOWN(0xe8) │ 450 │ +│ TENSOR_SCALAR │ 273 │ +│ MEMSET │ 257 │ +│ ACT_TABLE_LOAD │ 141 │ +│ TENSOR_REDUCE │ 114 │ +│ TENSOR_SCALAR_ADDR │ 113 │ +│ UNKNOWN(0xda) │ 68 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 57 │ +│ LOAD_MASK_SELECT │ 24 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 2 │ +│ IOTA │ 1 │ +└─────────────────────┴────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 5514 │ +│ Scalar │ 19287 │ +│ Tensor │ 229132 │ +│ SyncDMA │ 0 │ +│ Vector │ 10560 │ +│ Sync │ 55 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: Instruction Stats: +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: +┌─────────────────────┬────────┐ +│ Opcode │ Count │ +├─────────────────────┼────────┤ +│ MATMUL │ 114398 │ +│ LDWEIGHTS │ 114398 │ +│ CAST │ 17902 │ +│ COPY │ 5554 │ +│ EVENT_SEMAPHORE │ 5538 │ +│ UNKNOWN(0xd4) │ 2770 │ +│ ACTIVATE │ 2547 │ +│ TENSOR_TENSOR │ 1554 │ +│ TENSOR_SCALAR_ADDR │ 674 │ +│ PSEUDO_DMA_TRIGGER │ 652 │ +│ UNKNOWN(0xd8) │ 589 │ +│ UNKNOWN(0xe8) │ 450 │ +│ IOTA │ 394 │ +│ UNKNOWN(0xda) │ 293 │ +│ TENSOR_SCALAR │ 275 │ +│ MEMSET │ 269 │ +│ POOL_BUFFER_LOAD │ 240 │ +│ GATHER │ 240 │ +│ ACT_TABLE_LOAD │ 144 │ +│ DVE_READ_INDICES │ 128 │ +│ MATCH_REPLACE8 │ 128 │ +│ MATCH_VALUE_LOAD │ 128 │ +│ MAX8 │ 128 │ +│ TENSOR_REDUCE │ 119 │ +│ UNKNOWN(0xd9) │ 59 │ +│ RECIPROCAL │ 59 │ +│ LOAD_MASK_SELECT │ 25 │ +│ STREAM_SHUFFLE │ 24 │ +│ PSEUDO_BRANCH_LABEL │ 5 │ +│ MOVE │ 4 │ +│ UNKNOWN(0xe5) │ 2 │ +│ STREAM_TRANSPOSE │ 1 │ +│ NOP │ 1 │ +│ TENSOR_SCALAR │ 1 │ +│ RNG │ 1 │ +└─────────────────────┴────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: +┌────────────┬────────┐ +│ Engine │ Count │ +├────────────┼────────┤ +│ Unassigned │ 0 │ +│ GPSIMD │ 7810 │ +│ Scalar │ 19707 │ +│ Tensor │ 230974 │ +│ SyncDMA │ 0 │ +│ Vector │ 11115 │ +│ Sync │ 93 │ +│ All │ 0 │ +└────────────┴────────┘ + +2025-11-04T21:41:33Z USER 9624 (nc01/sg00) [Codegen]: isa_gen finished after 1.099 seconds +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4112 │ +│ qDVESpillReload0 │ 1232 │ +│ qPoolSpillReload0 │ 39513 │ +│ qSPIO0 │ 33 │ +│ qSPSpillReload0 │ 196 │ +└───────────────────┴────────────────┘ + +Total descriptors: 45086 (0.000671834 GB) +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: Tensors with largest descriptor count: +┌──────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├──────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ 38666.56447_i572 │ Internal │ float32 │ 1 │ +│ 38635.59714_i1 │ Internal │ bfloat16 │ 1 │ +│ t44615_45753_i0_remote_0 │ Internal │ bfloat16 │ 1 │ +│ 38666.56447_i480 │ Internal │ float32 │ 1 │ +│ 38666.56447_i502 │ Internal │ float32 │ 1 │ +│ 38293.59638_i1 │ Internal │ bfloat16 │ 1 │ +│ 38666.56447_i573 │ Internal │ float32 │ 1 │ +│ split_16 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└──────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:33Z USER 9624 (nc01/sg00) [Codegen]: dma_desc_gen finished after 0.010 seconds +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:33Z USER 9624 (nc00/sg00) [Codegen]: isa_gen finished after 1.132 seconds +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: Number of DMA descriptors on each queue instance: +┌───────────────────┬────────────────┐ +│ Queue Instance │ RT Descriptors │ +├───────────────────┼────────────────┤ +│ qActSpillReload0 │ 4052 │ +│ qDVESpillReload0 │ 1844 │ +│ qPoolSpillReload0 │ 50340 │ +│ qSPIO0 │ 51 │ +│ qSPSpillReload0 │ 330 │ +└───────────────────┴────────────────┘ + +Total descriptors: 56617 (0.000843659 GB) +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: Number of DMA engines used by each queue: +┌───────────────────┬──────────────────────┐ +│ Queue │ DMA Engines │ +├───────────────────┼──────────────────────┤ +│ qSPIO0 │ 16 │ +│ qSPDynamicHW │ 16 │ +│ qPoolDynamic │ 16 │ +│ qActDynamicHW │ 16 │ +│ qSPSpillReload0 │ 16 │ +│ qPoolSpillReload0 │ 16 │ +│ qActSpillReload0 │ 16 │ +│ qDVESpillReload0 │ 16 │ +├───────────────────┼──────────────────────┤ +│ TOTAL │ 128 (must be <= 176) │ +└───────────────────┴──────────────────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: Tensors with largest descriptor count: +┌────────────────────────┬───────────────┬──────────┬──────────────────┐ +│ Tensor Name │ Kind │ Src Type │ Descriptor Count │ +├────────────────────────┼───────────────┼──────────┼──────────────────┤ +│ custom_call.143 │ Internal │ float32 │ 2 │ +│ split_17 │ Internal │ float32 │ 2 │ +│ split_16 │ Internal │ int32 │ 2 │ +│ input2 │ ExternalInput │ int32 │ 2 │ +│ get_tuple_element.5 │ Internal │ float32 │ 2 │ +│ rng.1 │ Internal │ float32 │ 2 │ +│ broadcast_in_dim.17_i0 │ Internal │ int32 │ 2 │ +│ custom_call.142 │ Internal │ float32 │ 2 │ +│ input3 │ ExternalInput │ float32 │ 3 │ +│ convert.656 │ Internal │ float32 │ 297 │ +└────────────────────────┴───────────────┴──────────┴──────────────────┘ + +2025-11-04T21:41:33Z USER 9624 (nc00/sg00) [Codegen]: dma_desc_gen finished after 0.011 seconds +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [Codegen]: Generating debug info +2025-11-04T21:41:33Z WARNING 9624 (nc01/sg00) [Codegen]: Found 344 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:33Z USER 9624 (nc01/sg00) [Codegen]: debug_info_gen finished after 0.327 seconds +2025-11-04T21:41:33Z WARNING 9624 (nc00/sg00) [Codegen]: Found 417 instructions with more than 100 dependencies. For each such instruction, skipping writing more than 100 dependencies into the built-in NEFF debug info to prevent excessive compile time and NEFF size. For those instructions, the Neuron profiler will not display the skipped dependencies. +2025-11-04T21:41:33Z USER 9624 (nc00/sg00) [Codegen]: debug_info_gen finished after 0.332 seconds +2025-11-04T21:41:33Z USER 9624 (nc01/sg00) [ModuleForkPass]: codegen finished after 1.489 seconds +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [ModuleForkPass]: curr_vmrss: 3266mb, ru_maxrss: 3266mb (delta=163mb) +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 36255 memory location(s), 1 block(s), and 149424 instruction(s). Max writers: 299 Max Readers: 35741 +2025-11-04T21:41:33Z USER 9624 (nc00/sg00) [ModuleForkPass]: codegen finished after 1.540 seconds +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [ModuleForkPass]: curr_vmrss: 3150mb, ru_maxrss: 3266mb (delta=163mb) +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [ModuleForkPass]: Output has 1 module(s), 1 function(s), 37694 memory location(s), 1 block(s), and 152745 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:33Z USER 9624 [ModuleForkPass]: Compilation status: Total modules: 2, Passed: 2, Failed: 0 +2025-11-04T21:41:33Z USER 9624 [BackendPassManager]: mod_parallel_pass finished after 1.578 seconds +2025-11-04T21:41:33Z INFO 9624 [BackendPassManager]: curr_vmrss: 2942mb, ru_maxrss: 3266mb (delta=163mb) +2025-11-04T21:41:33Z USER 9624 [BackendPassManager]: Running hbm_usage +2025-11-04T21:41:33Z INFO 9624 [BackendPassManager]: Inputs to hbm_usage: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 480.000B │ 126.156KB │ +│ CCE │ 0.000B │ 674.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 2.000KB │ 161.000KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc00/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.668GB │ +│ Model Code │ 16.461MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.039MB │ +│ DMA Ring IO │ 2.469KB │ +│ DMA Ring Spill │ 961.828KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [HBMUsage]: +┌───────────────┬──────────┬───────────────────┐ +│ DMA Ring Type │ I/O Size │ Spill/Reload Size │ +├───────────────┼──────────┼───────────────────┤ +│ Copy │ 416.000B │ 114.594KB │ +│ CCE │ 0.000B │ 506.672KB │ +│ Transpose │ 0.000B │ 0.000B │ +│ Replicate │ 0.000B │ 0.000B │ +│ Overhead │ 1.500KB │ 140.250KB │ +└───────────────┴──────────┴───────────────────┘ + +2025-11-04T21:41:33Z INFO 9624 (nc01/sg00) [HBMUsage]: +┌─────────────────────┬───────────┐ +│ DRAM Memory Usage │ Size │ +├─────────────────────┼───────────┤ +│ Total: │ 3.667GB │ +│ Model Code │ 16.147MB │ +│ Model Constants │ 2.555MB │ +│ Unallocated Tensors │ 3.642GB │ +│ Allocated Tensors │ 6.039MB │ +│ DMA Ring IO │ 1.906KB │ +│ DMA Ring Spill │ 761.516KB │ +└─────────────────────┴───────────┘ + +2025-11-04T21:41:33Z INFO 9624 [HBMUsage]: Total estimated HBM usage is: 3.690GB +2025-11-04T21:41:33Z USER 9624 [BackendPassManager]: hbm_usage finished after 0.015 seconds +2025-11-04T21:41:33Z INFO 9624 [BackendPassManager]: curr_vmrss: 2907mb, ru_maxrss: 3266mb (delta=0mb) +2025-11-04T21:41:33Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 302169 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:33Z USER 9624 [BackendPassManager]: Running neff_packager +2025-11-04T21:41:33Z INFO 9624 [BackendPassManager]: Inputs to neff_packager: modules=2 functions=2 allocs=73949 blocks=2 instructions=302169 Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:33Z WARNING 9624 [NeffFileWriter]: writeKelp missing file /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704/metrics.json +2025-11-04T21:41:34Z WARNING 9624 [NeffFileWriter]: writeKelp missing file /local/p4clients/pkgbuild-const/workspace/build/KaenaCompiler/KaenaCompiler-2.x.207535.0/AL2_x86_64/DEV.STD.PTHREAD/build/private/_skbuild/linux-x86_64-3.10/cmake-build/neuronxcc/walrus/neff_packager/MetricMetadata.json +2025-11-04T21:41:34Z INFO 9624 [NeffFileWriter]: Neff will be written to: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff +2025-11-04T21:41:34Z INFO 9624 [NeffFileWriter]: IR signature: 420b94d904f2ad7b10cf2f69c261ab78 for neff artifacts +2025-11-04T21:41:34Z USER 9624 [BackendPassManager]: neff_packager finished after 0.548 seconds +2025-11-04T21:41:34Z INFO 9624 [BackendPassManager]: curr_vmrss: 2908mb, ru_maxrss: 3266mb (delta=0mb) +2025-11-04T21:41:34Z INFO 9624 [BackendPassManager]: Output has 2 module(s), 2 function(s), 73949 memory location(s), 2 block(s), and 302169 instruction(s). Max writers: 299 Max Readers: 36525 +2025-11-04T21:41:34Z INFO 9624 [BackendDriver]: HBM scratchpad usage summary (post-allocation): +┌──────┬───────────┬────────────────────────────────────────────────────────────┬─────────────┐ +│ Core │ Subgraph │ Description │ Value │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc00 │ sg00 │ Peak scratchpad usage: local │ 0.000103 GB │ +│ nc00 │ sg00 │ Peak scratchpad usage: local and shared │ 0.005898 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: local │ 0.000137 GB │ +│ nc00 │ sg00 │ Total size of allocated tensors: shared │ 0.005882 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local │ 0.000103 GB │ +│ nc00 │ Max │ Peak scratchpad usage: local and shared │ 0.005898 GB │ +│ nc00 │ Post-link │ Peak scratchpad usage after intermediate tensor allocation │ 0.000000 GB │ +│ nc00 │ Post-link │ Total size of allocated intermediate tensors │ 0.000000 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ nc01 │ sg00 │ Peak scratchpad usage: local │ 0.000088 GB │ +│ nc01 │ sg00 │ Total size of allocated tensors: local │ 0.000088 GB │ +│ nc01 │ Max │ Peak scratchpad usage: local │ 0.000088 GB │ +├──────┼───────────┼────────────────────────────────────────────────────────────┼─────────────┤ +│ Max │ Max │ Peak scratchpad usage │ 0.005898 GB │ +│ Max │ Max │ Peak scratchpad usage (page-aligned) │ 0.500000 GB │ +└──────┴───────────┴────────────────────────────────────────────────────────────┴─────────────┘ + +2025-11-04T21:41:34Z INFO 9624 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=local (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_18 │ float32 │ 1 │ 0.062500 MB │ +│ split_1 │ uint8 │ 1 │ 0.003906 MB │ +│ split_11 │ uint8 │ 1 │ 0.003906 MB │ +│ split_12 │ uint8 │ 1 │ 0.003906 MB │ +│ split_13 │ uint8 │ 1 │ 0.003906 MB │ +│ split_14 │ uint8 │ 1 │ 0.003906 MB │ +│ split_15 │ uint8 │ 1 │ 0.003906 MB │ +│ split_16 │ int32 │ 1 │ 0.003906 MB │ +│ split_17 │ float32 │ 1 │ 0.003906 MB │ +│ split_4 │ uint8 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:34Z INFO 9624 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc00, subgraph=sg00, addr_space=shared (complete data located at nc00/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Shared_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬──────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼──────────┼───────────────┼─────────────┤ +│ convert.656 │ float32 │ 1 │ 2.320312 MB │ +│ all_reduce.111 │ bfloat16 │ 1 │ 0.031250 MB │ +│ get_tuple_element.1 │ float32 │ 1 │ 0.007812 MB │ +│ get_tuple_element.2 │ uint32 │ 1 │ 0.007812 MB │ +│ all_reduce.112 │ bfloat16 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴──────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:34Z INFO 9624 [BackendDriver]: Largest tensors at peak scratchpad usage, core=nc01, subgraph=sg00, addr_space=local (complete data located at nc01/sg00/memory_analysis_after_coloring_allocator_dram_shared_DRAM_Local_hwm_allocations.csv): +┌────────────────────────────────────────────────────────────────┬─────────┬───────────────┬─────────────┐ +│ Tensor Name │ Type │ # Sub-tensors │ Total Size │ +├────────────────────────────────────────────────────────────────┼─────────┼───────────────┼─────────────┤ +│ split_17 │ float32 │ 1 │ 0.062500 MB │ +│ split_0 │ uint8 │ 1 │ 0.007812 MB │ +│ split_12 │ uint8 │ 1 │ 0.003906 MB │ +│ split_14 │ uint8 │ 1 │ 0.003906 MB │ +│ split_15 │ uint8 │ 1 │ 0.003906 MB │ +│ split_16 │ float32 │ 1 │ 0.003906 MB │ +│ split_8 │ uint8 │ 1 │ 0.003906 MB │ +└────────────────────────────────────────────────────────────────┴─────────┴───────────────┴─────────────┘ + +2025-11-04T21:41:34Z INFO 9624 [BackendDriver]: Backend completed successfully, tearing down. +2025-11-04T21:41:35Z INFO 8881 [job.WalrusDriver.0]: VNCBackend: completed successfully. +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.WalrusDriver.0 +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.BIRLinker.0 +2025-11-04T21:41:35Z INFO 8881 [job.BIRLinker.0]: Replay this job by calling: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/bin/neuronx-cc compile --framework XLA --state '{"model": ["/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb"], "tensormap": "tensor_map.json", "bir": "bir.json", "lorean_sg_key": null, "input_name_map": null, "output_name_map": null, "constant_tensors": null, "state_dir": "/home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704/nc00/sg00", "state_id": "nc00/sg00"}' --pipeline BIRLinker +2025-11-04T21:41:35Z INFO 8881 [job.BIRLinker.0]: BIRLinker cwd: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704 +2025-11-04T21:41:35Z INFO 8881 [job.BIRLinker.0]: Linking already done. +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.BIRLinker.0 +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.Kelper.0 +2025-11-04T21:41:35Z INFO 8881 [job.Kelper.0]: Skipping neff generation which was already performed by neff_packager +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.Kelper.0 +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Starting job job.NeffWrapper.0 +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Job NeffWrapper len(in_states) 1 +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Processing input #0 +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Start NeffWrapper +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Executing: /opt/aws_neuronx_venv_pytorch_2_8_nxd_inference/lib/python3.10/site-packages/neuronxcc/starfish/bin/hlo-neff-wrapper --hlo /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb --neff /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff --io_transposes /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704/io_transposes.json --output /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/wrapped_neff.hlo --netlist /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704/hlo_netlist.json +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Could not open file: /home/ubuntu/qwen3/qwen3-1.7B-TP2-BS8-SEQ4096/token_generation_model/_tp0_bk5/neuronxcc-rwu_y704/hlo_netlist.json +There are no io transposes nor zero-sized parameters. Output will not be produced. +Hlo neff wrapper finished successfully. Have a wonderful day :D + +2025-11-04T21:41:35Z INFO 8881 [job.NeffWrapper.0]: Job #0 finished +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Finished job job.NeffWrapper.0 +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Finished pipeline Pipeline +2025-11-04T21:41:35Z INFO 8881 [pipeline.Pipeline.0]: Job #0 finished +2025-11-04T21:41:35Z INFO 8808 [root]: Subcommand returned with exitcode=0 diff --git a/token_generation_model/_tp0_bk5/metaneff.pb b/token_generation_model/_tp0_bk5/metaneff.pb new file mode 100644 index 0000000000000000000000000000000000000000..6757bc9341c3ee8ec649e245abbb3033c5ddf746 --- /dev/null +++ b/token_generation_model/_tp0_bk5/metaneff.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7e1e45d0aedbf3d21eedb4563bfba43d90b71dff825e2a085895f1073422c1d +size 3988817 diff --git a/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb b/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb new file mode 100644 index 0000000000000000000000000000000000000000..42caa1a2ed3c90fea8b0dd78ecb779c1360bfca2 --- /dev/null +++ b/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.hlo_module.pb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23492667be5a29eeff4959603fe543af3c908826fce9139f187a4bb1a0a9d8e9 +size 4075105 diff --git a/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff b/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff new file mode 100644 index 0000000000000000000000000000000000000000..e0a147185604e99a58fe43296521e09f9cd2c4b0 --- /dev/null +++ b/token_generation_model/_tp0_bk5/model.MODULE_b0c5e51af4aeb4ea04b2+a0432539.neff @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8fd79ce1a7d4475261bf89398504b1054163d9dc97c945814252ecd96994960 +size 11981824 diff --git a/token_generation_model/_tp0_bk5/neuron_config.json b/token_generation_model/_tp0_bk5/neuron_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4af35ce5318206075b4f9aa8a6e9246df6d130a6 --- /dev/null +++ b/token_generation_model/_tp0_bk5/neuron_config.json @@ -0,0 +1,224 @@ +{ + "_attn_implementation_autoset": false, + "_name_or_path": "/home/ubuntu/models/Qwen3-1.7B", + "add_cross_attention": false, + "architectures": [ + "Qwen3ForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "attribute_map": {}, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": 151643, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 151645, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "fused_spec_config": null, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 6144, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 40960, + "max_window_layers": 28, + "metadata": null, + "min_length": 0, + "model_type": "qwen3", + "neuron_config": { + "activation_quantization_type": null, + "allow_input_truncation": false, + "apply_seq_ids_mask": false, + "async_mode": false, + "attention_dp_degree": 1, + "attention_dtype": null, + "attn_block_cte_nki_kernel_enabled": false, + "attn_block_tkg_nki_kernel_cache_update": false, + "attn_block_tkg_nki_kernel_cascaded_attention": false, + "attn_block_tkg_nki_kernel_enabled": false, + "attn_cls": { + "__module__": "neuronx_distributed_inference.models.qwen3.modeling_qwen3", + "__name__": "NeuronQwen3Attention" + }, + "attn_kernel_enabled": null, + "attn_tkg_builtin_kernel_enabled": false, + "attn_tkg_nki_kernel_enabled": false, + "batch_size": 8, + "bucket_n_active_tokens": false, + "buckets": [ + 4096 + ], + "cast_type": "config", + "cc_pipeline_tiling_factor": 1, + "chunked_prefill_config": null, + "context_encoding_buckets": null, + "cp_degree": 1, + "ctx_batch_size": 1, + "disable_kv_cache_tiling": false, + "draft_model_modules_to_not_convert": null, + "enable_bucketing": true, + "enable_cte_modular_flow": false, + "enable_eagle_draft_input_norm": false, + "enable_eagle_speculation": false, + "enable_fused_speculation": false, + "enable_long_context_mode": false, + "enable_output_completion_notifications": false, + "enable_spill_reload_dge": false, + "enable_token_tree": false, + "ep_degree": 1, + "expert_mlp_nki_kernel_enabled": null, + "flash_decoding_enabled": false, + "fused_qkv": false, + "fused_rmsnorm_skip_gamma": false, + "is_block_kv_layout": null, + "is_chunked_prefill": false, + "is_continuous_batching": true, + "is_eagle_draft": false, + "is_medusa": false, + "is_prefill_stage": false, + "is_prefix_caching": false, + "k_cache_transposed": false, + "kv_cache_batch_size": 8, + "kv_cache_padding_size": 0, + "kv_cache_quant": false, + "kv_cache_tiling": false, + "layer_boundary_markers": false, + "lm_head_pad": true, + "lm_head_pad_alignment_size": 1, + "local_ranks_size": 2, + "logical_nc_config": 2, + "lora_config": null, + "max_batch_size": 8, + "max_context_length": 4096, + "max_length": 4096, + "max_new_tokens": null, + "medusa_speculation_length": 0, + "medusa_tree": null, + "mlp_kernel_enabled": false, + "mlp_kernel_fuse_residual_add": false, + "modules_to_not_convert": null, + "moe_fused_nki_kernel_enabled": null, + "n_active_tokens": 1, + "n_positions": 4096, + "num_medusa_heads": 0, + "on_cpu": false, + "on_device_sampling_config": { + "deterministic": false, + "do_sample": false, + "dynamic": true, + "global_topk": 256, + "on_device_sampling_config": true, + "temperature": 1.0, + "top_k": 1, + "top_k_kernel_enabled": false, + "top_p": 1.0 + }, + "output_logits": false, + "overrides_torch_dtype": true, + "pa_block_size": 4096, + "pa_num_blocks": 8, + "padding_side": "right", + "pp_degree": 1, + "prefix_buckets": null, + "qk_layernorm": false, + "qkv_kernel_enabled": false, + "qkv_kernel_fuse_residual_add": false, + "qkv_kernel_nbsd_layout": false, + "quantization_dtype": "int8", + "quantization_type": "per_tensor_symmetric", + "quantize_clamp_bound": Infinity, + "quantized": false, + "quantized_checkpoints_path": null, + "quantized_mlp_kernel_enabled": false, + "rmsnorm_quantize_kernel_enabled": false, + "router_topk_nki_kernel_enabled": null, + "rpl_reduce_dtype": null, + "save_sharded_checkpoint": true, + "scratchpad_page_size": null, + "seq_len": 4096, + "seq_len_threshold_for_cc_tiling": 16384, + "sequence_parallel_enabled": false, + "shared_mlp_nki_kernel_enabled": null, + "skip_sharding": false, + "skip_warmup": false, + "spec_batch_size": 8, + "speculation_length": 0, + "start_rank_id": 0, + "strided_context_parallel_kernel_enabled": false, + "target": null, + "tensor_capture_config": null, + "tile_cc": false, + "tkg_batch_size": 8, + "token_generation_buckets": [ + 4096 + ], + "token_tree_config": null, + "torch_dtype": "bfloat16", + "tp_degree": 2, + "vocab_parallel": false, + "weight_gather_seq_len_threshold": 32768, + "weights_to_skip_layout_optimization": [], + "world_size": 2 + }, + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_cores_per_group": 1, + "num_hidden_layers": 28, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 0, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000, + "sep_token_id": null, + "sliding_window": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torchscript": false, + "transformers_version": "4.51.0", + "typical_p": 1.0, + "use_bfloat16": false, + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 151936 +} diff --git a/weights/tp0_sharded_checkpoint.safetensors b/weights/tp0_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8f699d3d5126985bc389be92ded6c6e837bef36 --- /dev/null +++ b/weights/tp0_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cb11f2bf623c9d034d00128c3d6f62155d809f6f8285d968ba924382948ccfb +size 2031901276 diff --git a/weights/tp1_sharded_checkpoint.safetensors b/weights/tp1_sharded_checkpoint.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..831665afe2d5007c5b7952c869a201b4f616d089 --- /dev/null +++ b/weights/tp1_sharded_checkpoint.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:404dde8bb0a4d4ca93e2a337362f0b0a15c14716f1b133a89f10265e0776d1c6 +size 2031901276